(svn r26257) -Codechange: replace most of the SSE macros by functions

replace/41b28d7194a279bdc17475d4fbe2ea6ec885a466
rubidium 11 years ago
parent 3a44e22b1a
commit f5f4f8a4b3

@ -1123,6 +1123,7 @@
<ClInclude Include="..\src\blitter\32bpp_optimized.hpp" />
<ClCompile Include="..\src\blitter\32bpp_simple.cpp" />
<ClInclude Include="..\src\blitter\32bpp_simple.hpp" />
<ClInclude Include="..\src\blitter\32bpp_sse_func.hpp" />
<ClCompile Include="..\src\blitter\32bpp_sse2.cpp" />
<ClInclude Include="..\src\blitter\32bpp_sse2.hpp" />
<ClCompile Include="..\src\blitter\32bpp_sse4.cpp" />

@ -2598,6 +2598,9 @@
<ClInclude Include="..\src\blitter\32bpp_simple.hpp">
<Filter>Blitters</Filter>
</ClInclude>
<ClInclude Include="..\src\blitter\32bpp_sse_func.hpp">
<Filter>Blitters</Filter>
</ClInclude>
<ClCompile Include="..\src\blitter\32bpp_sse2.cpp">
<Filter>Blitters</Filter>
</ClCompile>

@ -3834,6 +3834,10 @@
RelativePath=".\..\src\blitter\32bpp_simple.hpp"
>
</File>
<File
RelativePath=".\..\src\blitter\32bpp_sse_func.hpp"
>
</File>
<File
RelativePath=".\..\src\blitter\32bpp_sse2.cpp"
>

@ -3831,6 +3831,10 @@
RelativePath=".\..\src\blitter\32bpp_simple.hpp"
>
</File>
<File
RelativePath=".\..\src\blitter\32bpp_sse_func.hpp"
>
</File>
<File
RelativePath=".\..\src\blitter\32bpp_sse2.cpp"
>

@ -912,6 +912,7 @@ blitter/32bpp_optimized.hpp
blitter/32bpp_simple.cpp
blitter/32bpp_simple.hpp
#if SSE
blitter/32bpp_sse_func.hpp
blitter/32bpp_sse2.cpp
blitter/32bpp_sse2.hpp
blitter/32bpp_sse4.cpp

@ -83,12 +83,12 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
const byte m0 = mvX2;
if (m0 >= PALETTE_ANIM_START) {
const Colour c0 = (this->LookupColourInPalette(m0).data & 0x00FFFFFF) | (src[0].data & 0xFF000000);
INSR32(AdjustBrightness(c0, (byte) (mvX2 >> 8)).data, srcABCD, 0);
InsertFirstUint32(AdjustBrightneSSE(c0, (byte) (mvX2 >> 8)).data, srcABCD);
}
const byte m1 = mvX2 >> 16;
if (m1 >= PALETTE_ANIM_START) {
const Colour c1 = (this->LookupColourInPalette(m1).data & 0x00FFFFFF) | (src[1].data & 0xFF000000);
INSR32(AdjustBrightness(c1, (byte) (mvX2 >> 24)).data, srcABCD, 1);
InsertSecondUint32(AdjustBrightneSSE(c1, (byte) (mvX2 >> 24)).data, srcABCD);
}
/* Update anim buffer. */
@ -118,7 +118,7 @@ inline void Blitter_32bppSSE4_Anim::Draw(const Blitter::BlitterParams *bp, ZoomL
/* Blend colours. */
bmno_alpha_blend:
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
bmno_full_opacity:
_mm_storel_epi64((__m128i *) dst, srcABCD);
bmno_full_transparency:
@ -132,20 +132,19 @@ bmno_full_transparency:
if (src->a == 0) {
} else if (src->a == 255) {
*anim = *(const uint16*) src_mv;
*dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
*dst = (src_mv->m >= PALETTE_ANIM_START) ? AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v) : *src;
} else {
*anim = 0;
__m128i srcABCD;
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
if (src_mv->m >= PALETTE_ANIM_START) {
Colour colour = AdjustBrightness(LookupColourInPalette(src_mv->m), src_mv->v);
Colour colour = AdjustBrightneSSE(LookupColourInPalette(src_mv->m), src_mv->v);
colour.a = src->a;
srcABCD = _mm_cvtsi32_si128(colour.data);
} else {
srcABCD = _mm_cvtsi32_si128(src->data);
}
ALPHA_BLEND_2();
dst->data = _mm_cvtsi128_si32(srcABCD);
dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
}
break;
@ -162,24 +161,36 @@ bmno_full_transparency:
const uint m1 = (byte) (mvX2 >> 16);
const uint r1 = remap[m1];
if (mvX2 & 0x00FF00FF) {
/* Written so the compiler uses CMOV. */
const Colour src0 = src[0];
const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
Colour c0 = dst[0];
c0 = r0 == 0 ? c0 : c0map;
c0 = m0 != 0 ? c0 : src0;
srcABCD = _mm_cvtsi32_si128(c0.data);
const Colour src1 = src[1];
const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
Colour c1 = dst[1];
c1 = r1 == 0 ? c1 : c1map;
c1 = m1 != 0 ? c1 : src1;
INSR32(c1.data, srcABCD, 1);
#define CMOV_REMAP(m_colour, m_colour_init, m_src, m_m) \
/* Written so the compiler uses CMOV. */ \
Colour m_colour = m_colour_init; \
{ \
const Colour srcm = (Colour) (m_src); \
const uint m = (byte) (m_m); \
const uint r = remap[m]; \
const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
m_colour = r == 0 ? m_colour : cmap; \
m_colour = m != 0 ? m_colour : srcm; \
}
#ifdef _SQ64
uint64 srcs = _mm_cvtsi128_si64(srcABCD);
uint64 dsts = _mm_cvtsi128_si64(dstABCD);
uint64 remapped_src = 0;
CMOV_REMAP(c0, dsts, srcs, mvX2);
remapped_src = c0.data;
CMOV_REMAP(c1, dsts >> 32, srcs >> 32, mvX2 >> 16);
remapped_src |= (uint64) c1.data << 32;
srcABCD = _mm_cvtsi64_si128(remapped_src);
#else
Colour remapped_src[2];
CMOV_REMAP(c0, _mm_cvtsi128_si32(dstABCD), _mm_cvtsi128_si32(srcABCD), mvX2);
remapped_src[0] = c0.data;
CMOV_REMAP(c1, dst[1], src[1], mvX2 >> 16);
remapped_src[1] = c1.data;
srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
#endif
if ((mvX2 & 0xFF00FF00) != 0x80008000) {
ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
}
if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Update anim buffer. */
@ -211,7 +222,7 @@ bmno_full_transparency:
/* Blend colours. */
bmcr_alpha_blend:
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
bmcr_full_opacity:
_mm_storel_epi64((__m128i *) dst, srcABCD);
bmcr_full_transparency:
@ -229,7 +240,7 @@ bmcr_full_transparency:
const uint r = remap[src_mv->m];
*anim = (src->a == 255) ? r | ((uint16) src_mv->v << 8 ) : 0;
if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@ -244,7 +255,7 @@ bmcr_full_transparency:
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@ -256,8 +267,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
DARKEN_2();
_mm_storel_epi64((__m128i *) dst, dstAB);
_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
anim += 2;
@ -268,8 +278,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
DARKEN_2();
dst->data = _mm_cvtsi128_si32(dstAB);
dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
if (src[0].a) anim[0] = 0;
}
break;
@ -318,13 +327,4 @@ void Blitter_32bppSSE4_Anim::Draw(Blitter::BlitterParams *bp, BlitterMode mode,
}
}
/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
inline Colour Blitter_32bppSSE4_Anim::AdjustBrightness(Colour colour, uint8 brightness)
{
/* Shortcut for normal brightness. */
if (brightness == DEFAULT_BRIGHTNESS) return colour;
return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
}
#endif /* WITH_SSE */

@ -14,6 +14,9 @@
#ifdef WITH_SSE
#ifndef SSE_VERSION
#define SSE_VERSION 4
#endif
#include "32bpp_anim.hpp"
#include "32bpp_sse4.hpp"
@ -28,11 +31,9 @@ public:
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
/* virtual */ void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
/* virtual */ Colour AdjustBrightness(Colour colour, uint8 brightness);
/* virtual */ Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator) {
return Blitter_32bppSSE_Base::Encode(sprite, allocator);
}
/* virtual */ const char *GetName() { return "32bpp-sse4-anim"; }
};

@ -73,8 +73,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
ALPHA_BLEND_2();
_mm_storel_epi64((__m128i*) dst, srcABCD);
_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
src += 2;
dst += 2;
}
@ -82,8 +81,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
dst->data = _mm_cvtsi128_si32(srcABCD);
dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi));
}
break;
@ -94,7 +92,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@ -108,7 +106,7 @@ inline void Blitter_32bppSSE2::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, clear_hi, clear_hi);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@ -123,8 +121,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
DARKEN_2();
_mm_storel_epi64((__m128i *) dst, dstAB);
_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
src += 2;
dst += 2;
}
@ -132,8 +129,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
DARKEN_2();
dst->data = _mm_cvtsi128_si32(dstAB);
dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, tr_nom_base, tr_nom_base));
}
break;
}
@ -235,7 +231,7 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
dst_mv->v = (rgb_max == 0) ? Blitter_32bppBase::DEFAULT_BRIGHTNESS : rgb_max;
/* Pre-convert the mapping channel to a RGB value. */
const Colour colour = AdjustBrightness(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
const Colour colour = AdjustBrightneSSE(Blitter_32bppBase::LookupColourInPalette(src->m), dst_mv->v);
dst_rgba->r = colour.r;
dst_rgba->g = colour.g;
dst_rgba->b = colour.b;
@ -282,47 +278,4 @@ Sprite *Blitter_32bppSSE_Base::Encode(const SpriteLoader::Sprite *sprite, Alloca
return dst_sprite;
}
/** ReallyAdjustBrightness() is not called that often.
* Inlining this function implies a far jump, which has a huge latency.
*/
inline Colour Blitter_32bppSSE2::AdjustBrightness(Colour colour, uint8 brightness)
{
/* Shortcut for normal brightness. */
if (brightness == DEFAULT_BRIGHTNESS) return colour;
return Blitter_32bppSSE2::ReallyAdjustBrightness(colour, brightness);
}
IGNORE_UNINITIALIZED_WARNING_START
Colour Blitter_32bppSSE2::ReallyAdjustBrightness(Colour colour, uint8 brightness)
{
uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
c16 *= brightness;
uint64 c16_ob = c16; // Helps out of order execution.
c16 /= DEFAULT_BRIGHTNESS;
c16 &= 0x01FF01FF01FF;
/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
const uint32 alpha32 = colour.data & 0xFF000000;
__m128i ret;
LOAD64(c16, ret);
if (ob != 0) {
__m128i ob128 = _mm_cvtsi32_si128(ob);
ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
__m128i white = OVERBRIGHT_VALUE_MASK;
__m128i c128 = ret;
ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */
ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */
ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */
}
ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
return alpha32 | _mm_cvtsi128_si32(ret);
}
IGNORE_UNINITIALIZED_WARNING_STOP
#endif /* WITH_SSE */

@ -14,91 +14,10 @@
#ifdef WITH_SSE
#include "32bpp_simple.hpp"
#include "emmintrin.h"
#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
#ifdef _MSC_VER
#define ALIGN(n) __declspec(align(n))
#else
#define ALIGN(n) __attribute__ ((aligned (n)))
#endif
typedef union ALIGN(16) um128i {
__m128i m128i;
uint8 m128i_u8[16];
uint16 m128i_u16[8];
uint32 m128i_u32[4];
uint64 m128i_u64[2];
} um128i;
#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0)
#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1)
#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2)
#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0)
#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0)
#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0)
#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7)
#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)
#define EXTR32(m_from, m_rank) (*(um128i*) &m_from).m128i_u32[m_rank]
#define EXTR64(m_from, m_rank) (*(um128i*) &m_from).m128i_u64[m_rank]
#define INSR32(m_val, m_into, m_rank) { \
(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, m_val, (m_rank)*2); \
(*(um128i*) &m_into).m128i = _mm_insert_epi16((*(um128i*) &m_into).m128i, (m_val) >> 16, (m_rank)*2 + 1); \
}
#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i_u64[m_rank] = (m_val)
#ifdef _SQ64
#define LOAD64(m_val, m_into) m_into = _mm_cvtsi64_si128(m_val);
#else
#define LOAD64(m_val, m_into) INSR64(m_val, m_into, 0)
#ifndef SSE_VERSION
#define SSE_VERSION 2
#endif
/* PUT_ALPHA_IN_FRONT_OF_RGB is redefined in 32bpp_ssse3.hpp. */
#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) \
m_into = _mm_shufflelo_epi16(m_from, 0x3F); /* PSHUFLW, put alpha1 in front of each rgb1 */ \
m_into = _mm_shufflehi_epi16(m_into, 0x3F); /* PSHUFHW, put alpha2 in front of each rgb2 */
/* PACK_AB_WITHOUT_SATURATION is redefined in 32bpp_ssse3.hpp. */
#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) \
m_from = _mm_and_si128(m_from, clear_hi); /* PAND, wipe high bytes to keep low bytes when packing */ \
m_into = _mm_packus_epi16(m_from, m_from); /* PACKUSWB, pack 2 colours (with saturation) */
/* Alpha blend 2 pixels. */
#define ALPHA_BLEND_2() { \
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); /* PUNPCKLBW, expand each uint8 into uint16 */ \
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
\
__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); /* PCMPGTW, if (alpha > 0) a++; */ \
alphaAB = _mm_srli_epi16(alphaAB, 15); \
alphaAB = _mm_add_epi16(alphaAB, srcAB); \
PUT_ALPHA_IN_FRONT_OF_RGB(alphaAB, alphaAB); \
\
srcAB = _mm_sub_epi16(srcAB, dstAB); /* PSUBW, (r - Cr) */ \
srcAB = _mm_mullo_epi16(srcAB, alphaAB); /* PMULLW, a*(r - Cr) */ \
srcAB = _mm_srli_epi16(srcAB, 8); /* PSRLW, a*(r - Cr)/256 */ \
srcAB = _mm_add_epi16(srcAB, dstAB); /* PADDW, a*(r - Cr)/256 + Cr */ \
PACK_AB_WITHOUT_SATURATION(srcAB, srcABCD); \
}
/* Darken 2 pixels.
* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
*/
#define DARKEN_2() \
__m128i srcAB = _mm_unpacklo_epi8(srcABCD, _mm_setzero_si128()); \
__m128i dstAB = _mm_unpacklo_epi8(dstABCD, _mm_setzero_si128()); \
__m128i PUT_ALPHA_IN_FRONT_OF_RGB(srcAB, alphaAB); \
alphaAB = _mm_srli_epi16(alphaAB, 2); /* Reduce to 64 levels of shades so the max value fits in 16 bits. */ \
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB); \
dstAB = _mm_mullo_epi16(dstAB, nom); \
dstAB = _mm_srli_epi16(dstAB, 8); \
dstAB = _mm_packus_epi16(dstAB, dstAB);
#include "32bpp_sse_func.hpp"
/** Base methods for 32bpp SSE blitters. */
class Blitter_32bppSSE_Base {
@ -138,14 +57,11 @@ public:
};
Sprite *Encode(const SpriteLoader::Sprite *sprite, AllocatorProc *allocator);
virtual Colour AdjustBrightness(Colour colour, uint8 brightness) = 0;
};
/** The SSE2 32 bpp blitter (without palette animation). */
class Blitter_32bppSSE2 : public Blitter_32bppSimple, public Blitter_32bppSSE_Base {
public:
virtual Colour AdjustBrightness(Colour colour, uint8 brightness);
static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);

@ -74,8 +74,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
ALPHA_BLEND_2();
_mm_storel_epi64((__m128i*) dst, srcABCD);
_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
src += 2;
dst += 2;
}
@ -83,8 +82,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
dst->data = _mm_cvtsi128_si32(srcABCD);
dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
break;
@ -96,33 +94,39 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Remap colours. */
if (mvX2 & 0x00FF00FF) {
/* Written so the compiler uses CMOV. */
const Colour src0 = src[0];
const uint m0 = (byte) mvX2;
const uint r0 = remap[m0];
const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
Colour c0 = 0; // Use alpha of 0 to keep dst as is.
c0 = r0 == 0 ? c0 : c0map;
c0 = m0 != 0 ? c0 : src0;
srcABCD = _mm_cvtsi32_si128(c0.data);
const Colour src1 = src[1];
const uint m1 = (byte) (mvX2 >> 16);
const uint r1 = remap[m1];
const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
Colour c1 = 0;
c1 = r1 == 0 ? c1 : c1map;
c1 = m1 != 0 ? c1 : src1;
INSR32(c1.data, srcABCD, 1);
if ((mvX2 & 0xFF00FF00) != 0x80008000) {
ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
}
#define CMOV_REMAP(m_colour, m_src, m_m) \
/* Written so the compiler uses CMOV. */ \
Colour m_colour = 0; \
{ \
const Colour srcm = (Colour) (m_src); \
const uint m = (byte) (m_m); \
const uint r = remap[m]; \
const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
m_colour = r == 0 ? m_colour : cmap; \
m_colour = m != 0 ? m_colour : srcm; \
}
#ifdef _SQ64
uint64 srcs = _mm_cvtsi128_si64(srcABCD);
uint64 remapped_src = 0;
CMOV_REMAP(c0, srcs, mvX2);
remapped_src = c0.data;
CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
remapped_src |= (uint64) c1.data << 32;
srcABCD = _mm_cvtsi64_si128(remapped_src);
#else
Colour remapped_src[2];
CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
remapped_src[0] = c0.data;
CMOV_REMAP(c1, src[1], mvX2 >> 16);
remapped_src[1] = c1.data;
srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
#endif
if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Blend colours. */
ALPHA_BLEND_2();
_mm_storel_epi64((__m128i *) dst, srcABCD);
_mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
dst += 2;
src += 2;
src_mv += 2;
@ -134,7 +138,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@ -148,7 +152,7 @@ inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
DARKEN_2();
_mm_storel_epi64((__m128i *) dst, dstAB);
_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
}
@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
DARKEN_2();
dst->data = _mm_cvtsi128_si32(dstAB);
dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
}
break;
}
@ -217,45 +219,4 @@ void Blitter_32bppSSE4::Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomL
}
}
/** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
inline Colour Blitter_32bppSSE4::AdjustBrightness(Colour colour, uint8 brightness)
{
/* Shortcut for normal brightness. */
if (brightness == DEFAULT_BRIGHTNESS) return colour;
return Blitter_32bppSSE4::ReallyAdjustBrightness(colour, brightness);
}
IGNORE_UNINITIALIZED_WARNING_START
Colour Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour, uint8 brightness)
{
uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
c16 *= brightness;
uint64 c16_ob = c16; // Helps out of order execution.
c16 /= DEFAULT_BRIGHTNESS;
c16 &= 0x01FF01FF01FF;
/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
const uint32 alpha32 = colour.data & 0xFF000000;
__m128i ret;
LOAD64(c16, ret);
if (ob != 0) {
__m128i ob128 = _mm_cvtsi32_si128(ob);
ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
__m128i white = OVERBRIGHT_VALUE_MASK;
__m128i c128 = ret;
ret = _mm_subs_epu16(white, c128); /* PSUBUSW, (255 - rgb) */
ret = _mm_mullo_epi16(ret, ob128); /* PMULLW, ob*(255 - rgb) */
ret = _mm_srli_epi16(ret, 8); /* PSRLW, ob*(255 - rgb)/256 */
ret = _mm_add_epi16(ret, c128); /* PADDW, ob*(255 - rgb)/256 + rgb */
}
ret = _mm_packus_epi16(ret, ret); /* PACKUSWB, saturate and pack. */
return alpha32 | _mm_cvtsi128_si32(ret);
}
IGNORE_UNINITIALIZED_WARNING_STOP
#endif /* WITH_SSE */

@ -14,41 +14,14 @@
#ifdef WITH_SSE
#include "32bpp_ssse3.hpp"
#include "smmintrin.h"
#undef EXTR32
#define EXTR32(m_from, m_rank) _mm_extract_epi32((*(um128i*) &m_from).m128i, m_rank)
#undef INSR32
#define INSR32(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, m_val, m_rank)
IGNORE_UNINITIALIZED_WARNING_START
#ifdef _SQ64
#undef INSR64
#define INSR64(m_val, m_into, m_rank) (*(um128i*) &m_into).m128i = _mm_insert_epi64((*(um128i*) &m_into).m128i, m_val, m_rank)
#else
typedef union { uint64 u64; struct _u32 { uint32 low, high; } u32; } u6432;
#undef INSR64
#define INSR64(m_val, m_into, m_rank) { \
u6432 v; \
v.u64 = m_val; \
(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.low, (m_rank)*2); \
(*(um128i*) &m_into).m128i = _mm_insert_epi32((*(um128i*) &m_into).m128i, v.u32.high, (m_rank)*2 + 1); \
}
#undef LOAD64
#define LOAD64(m_val, m_into) \
m_into = _mm_cvtsi32_si128(m_val); \
INSR32((m_val) >> 32, m_into, 1);
#ifndef SSE_VERSION
#define SSE_VERSION 4
#endif
IGNORE_UNINITIALIZED_WARNING_STOP
#include "32bpp_ssse3.hpp"
/** The SSE4 32 bpp blitter (without palette animation). */
class Blitter_32bppSSE4 : public Blitter_32bppSSSE3 {
public:
Colour AdjustBrightness(Colour colour, uint8 brightness);
static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness);
/* virtual */ void Draw(Blitter::BlitterParams *bp, BlitterMode mode, ZoomLevel zoom);
template <BlitterMode mode, Blitter_32bppSSE_Base::ReadMode read_mode, Blitter_32bppSSE_Base::BlockType bt_last>
void Draw(const Blitter::BlitterParams *bp, ZoomLevel zoom);

@ -0,0 +1,225 @@
/* $Id$ */
/*
* This file is part of OpenTTD.
* OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
* OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
*/
/** @file 32bpp_sse_base.hpp Functions related to SSE 32 bpp blitter. */
#ifndef BLITTER_32BPP_SSE_BASE_HPP
#define BLITTER_32BPP_SSE_BASE_HPP
#ifdef WITH_SSE
#include "32bpp_simple.hpp"
#if (SSE_VERSION == 2)
#include <emmintrin.h>
#elif (SSE_VERSION == 3)
#include <tmmintrin.h>
#elif (SSE_VERSION == 4)
#include <smmintrin.h>
#endif
#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.
#ifdef _MSC_VER
#define ALIGN(n) __declspec(align(n))
#else
#define ALIGN(n) __attribute__ ((aligned (n)))
#endif
typedef union ALIGN(16) um128i {
__m128i m128i;
uint8 m128i_u8[16];
uint16 m128i_u16[8];
uint32 m128i_u32[4];
uint64 m128i_u64[2];
} um128i;
#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0)
#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1)
#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2)
#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0)
#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0)
#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0)
#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7)
#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)
static inline void InsertFirstUint32(const uint32 value, __m128i &into)
{
#if (SSE_VERSION >= 4)
into = _mm_insert_epi32(into, value, 0);
#else
NOT_REACHED();
#endif
}
static inline void InsertSecondUint32(const uint32 value, __m128i &into)
{
#if (SSE_VERSION >= 4)
into = _mm_insert_epi32(into, value, 1);
#else
into = _mm_insert_epi16(into, value, 2);
into = _mm_insert_epi16(into, value >> 16, 3);
#endif
}
static inline void LoadUint64(const uint64 value, __m128i &into)
{
#ifdef _SQ64
into = _mm_cvtsi64_si128(value);
#else
#if (SSE_VERSION >= 4)
into = _mm_cvtsi32_si128(value);
InsertSecondUint32(value >> 32, into);
#else
(*(um128i*) &into).m128i_u64[0] = value;
#endif
#endif
}
static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask)
{
#if (SSE_VERSION == 2)
from = _mm_and_si128(from, mask); // PAND, wipe high bytes to keep low bytes when packing
return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)
#else
return _mm_shuffle_epi8(from, mask);
#endif
}
static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask)
{
#if (SSE_VERSION == 2)
__m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1
return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2
#else
return _mm_shuffle_epi8(from, mask);
#endif
}
static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask)
{
__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16
__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;
alphaAB = _mm_srli_epi16(alphaAB, 15);
alphaAB = _mm_add_epi16(alphaAB, srcAB);
alphaAB = DistributeAlpha(alphaAB, distribution_mask);
srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr)
srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)
srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256
srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr
return PackUnsaturated(srcAB, pack_mask);
}
/* Darken 2 pixels.
* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
*/
static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)
{
__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
__m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
dstAB = _mm_mullo_epi16(dstAB, nom);
dstAB = _mm_srli_epi16(dstAB, 8);
return _mm_packus_epi16(dstAB, dstAB);
}
IGNORE_UNINITIALIZED_WARNING_START
static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness)
{
uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
c16 *= brightness;
uint64 c16_ob = c16; // Helps out of order execution.
c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
c16 &= 0x01FF01FF01FF;
/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;
const uint32 alpha32 = colour.data & 0xFF000000;
__m128i ret;
LoadUint64(c16, ret);
if (ob != 0) {
__m128i ob128 = _mm_cvtsi32_si128(ob);
ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
__m128i white = OVERBRIGHT_VALUE_MASK;
__m128i c128 = ret;
ret = _mm_subs_epu16(white, c128); // PSUBUSW, (255 - rgb)
ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)
ret = _mm_srli_epi16(ret, 8); // PSRLW, ob*(255 - rgb)/256
ret = _mm_add_epi16(ret, c128); // PADDW, ob*(255 - rgb)/256 + rgb
}
ret = _mm_packus_epi16(ret, ret); // PACKUSWB, saturate and pack.
return alpha32 | _mm_cvtsi128_si32(ret);
}
IGNORE_UNINITIALIZED_WARNING_STOP
/** ReallyAdjustBrightness() is not called that often.
* Inlining this function implies a far jump, which has a huge latency.
*/
static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)
{
/* Shortcut for normal brightness. */
if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour;
return ReallyAdjustBrightness(colour, brightness);
}
static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness)
{
#if (SSE_VERSION < 3)
NOT_REACHED();
#else
/* The following dataflow differs from the one of AdjustBrightness() only for alpha.
* In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
* OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
*/
brightness &= 0xFF00FF00;
brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;
__m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
__m128i briAB = _mm_cvtsi32_si128(brightness);
briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.
colAB = _mm_mullo_epi16(colAB, briAB);
__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7);
colAB = _mm_srli_epi16(colAB, 7);
/* Sum overbright.
* Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
* -255 is changed in -256 so we just have to take the 8 lower bits into account.
*/
colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
colAB_ob = _mm_and_si128(colAB_ob, colAB);
__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());
obAB = _mm_srli_epi16(obAB, 1); // Reduce overbright strength.
obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
__m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.
retAB = _mm_subs_epu16(retAB, colAB); // (255 - rgb)
retAB = _mm_mullo_epi16(retAB, obAB); // ob*(255 - rgb)
retAB = _mm_srli_epi16(retAB, 8); // ob*(255 - rgb)/256
retAB = _mm_add_epi16(retAB, colAB); // ob*(255 - rgb)/256 + rgb
return _mm_packus_epi16(retAB, retAB);
#endif
}
#endif /* WITH_SSE */
#endif /* BLITTER_32BPP_SSE_BASE_HPP */

@ -74,8 +74,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
for (uint x = (uint) effective_width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
ALPHA_BLEND_2();
_mm_storel_epi64((__m128i*) dst, srcABCD);
_mm_storel_epi64((__m128i*) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
src += 2;
dst += 2;
}
@ -83,8 +82,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if ((bt_last == BT_NONE && effective_width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
dst->data = _mm_cvtsi128_si32(srcABCD);
dst->data = _mm_cvtsi128_si32(AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
}
break;
@ -96,33 +94,39 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
/* Remap colours. */
if (mvX2 & 0x00FF00FF) {
/* Written so the compiler uses CMOV. */
const Colour src0 = src[0];
const uint m0 = (byte) mvX2;
const uint r0 = remap[m0];
const Colour c0map = (this->LookupColourInPalette(r0).data & 0x00FFFFFF) | (src0.data & 0xFF000000);
Colour c0 = 0; // Use alpha of 0 to keep dst as is.
c0 = r0 == 0 ? c0 : c0map;
c0 = m0 != 0 ? c0 : src0;
srcABCD = _mm_cvtsi32_si128(c0.data);
const Colour src1 = src[1];
const uint m1 = (byte) (mvX2 >> 16);
const uint r1 = remap[m1];
const Colour c1map = (this->LookupColourInPalette(r1).data & 0x00FFFFFF) | (src1.data & 0xFF000000);
Colour c1 = 0;
c1 = r1 == 0 ? c1 : c1map;
c1 = m1 != 0 ? c1 : src1;
INSR32(c1.data, srcABCD, 1);
if ((mvX2 & 0xFF00FF00) != 0x80008000) {
ADJUST_BRIGHTNESS_2(srcABCD, mvX2);
}
#define CMOV_REMAP(m_colour, m_src, m_m) \
/* Written so the compiler uses CMOV. */ \
Colour m_colour = 0; \
{ \
const Colour srcm = (Colour) (m_src); \
const uint m = (byte) (m_m); \
const uint r = remap[m]; \
const Colour cmap = (this->LookupColourInPalette(r).data & 0x00FFFFFF) | (srcm.data & 0xFF000000); \
m_colour = r == 0 ? m_colour : cmap; \
m_colour = m != 0 ? m_colour : srcm; \
}
#ifdef _SQ64
uint64 srcs = _mm_cvtsi128_si64(srcABCD);
uint64 remapped_src = 0;
CMOV_REMAP(c0, srcs, mvX2);
remapped_src = c0.data;
CMOV_REMAP(c1, srcs >> 32, mvX2 >> 16);
remapped_src |= (uint64) c1.data << 32;
srcABCD = _mm_cvtsi64_si128(remapped_src);
#else
Colour remapped_src[2];
CMOV_REMAP(c0, _mm_cvtsi128_si32(srcABCD), mvX2);
remapped_src[0] = c0.data;
CMOV_REMAP(c1, src[1], mvX2 >> 16);
remapped_src[1] = c1.data;
srcABCD = _mm_loadl_epi64((__m128i*) &remapped_src);
#endif
if ((mvX2 & 0xFF00FF00) != 0x80008000) srcABCD = AdjustBrightnessOfTwoPixels(srcABCD, mvX2);
}
/* Blend colours. */
ALPHA_BLEND_2();
_mm_storel_epi64((__m128i *) dst, srcABCD);
_mm_storel_epi64((__m128i *) dst, AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm));
dst += 2;
src += 2;
src_mv += 2;
@ -134,7 +138,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src_mv->m) {
const uint r = remap[src_mv->m];
if (r != 0) {
Colour remapped_colour = AdjustBrightness(this->LookupColourInPalette(r), src_mv->v);
Colour remapped_colour = AdjustBrightneSSE(this->LookupColourInPalette(r), src_mv->v);
if (src->a == 255) {
*dst = remapped_colour;
} else {
@ -148,7 +152,7 @@ inline void Blitter_32bppSSSE3::Draw(const Blitter::BlitterParams *bp, ZoomLevel
if (src->a < 255) {
bmcr_alpha_blend_single:
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
ALPHA_BLEND_2();
srcABCD = AlphaBlendTwoPixels(srcABCD, dstABCD, a_cm, pack_low_cm);
}
dst->data = _mm_cvtsi128_si32(srcABCD);
}
@ -160,8 +164,7 @@ bmcr_alpha_blend_single:
for (uint x = (uint) bp->width / 2; x > 0; x--) {
__m128i srcABCD = _mm_loadl_epi64((const __m128i*) src);
__m128i dstABCD = _mm_loadl_epi64((__m128i*) dst);
DARKEN_2();
_mm_storel_epi64((__m128i *) dst, dstAB);
_mm_storel_epi64((__m128i *) dst, DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
src += 2;
dst += 2;
}
@ -169,8 +172,7 @@ bmcr_alpha_blend_single:
if ((bt_last == BT_NONE && bp->width & 1) || bt_last == BT_ODD) {
__m128i srcABCD = _mm_cvtsi32_si128(src->data);
__m128i dstABCD = _mm_cvtsi32_si128(dst->data);
DARKEN_2();
dst->data = _mm_cvtsi128_si32(dstAB);
dst->data = _mm_cvtsi128_si32(DarkenTwoPixels(srcABCD, dstABCD, a_cm, tr_nom_base));
}
break;
}

@ -14,53 +14,10 @@
#ifdef WITH_SSE
#ifndef SSE_VERSION
#define SSE_VERSION 3
#endif
#include "32bpp_sse2.hpp"
#include "tmmintrin.h"
/* Use PSHUFB instead of PSHUFHW+PSHUFLW. */
#undef PUT_ALPHA_IN_FRONT_OF_RGB
#define PUT_ALPHA_IN_FRONT_OF_RGB(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, a_cm);
#undef PACK_AB_WITHOUT_SATURATION
#define PACK_AB_WITHOUT_SATURATION(m_from, m_into) m_into = _mm_shuffle_epi8(m_from, pack_low_cm);
/* Adjust brightness of 2 pixels. */
#define ADJUST_BRIGHTNESS_2(m_colourX2, m_brightnessX2) \
/* The following dataflow differs from the one of AdjustBrightness() only for alpha.
* In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
* OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
*/ \
m_brightnessX2 &= 0xFF00FF00; \
m_brightnessX2 += DEFAULT_BRIGHTNESS; \
\
__m128i zero = _mm_setzero_si128(); \
__m128i colAB = _mm_unpacklo_epi8(m_colourX2, zero); \
\
__m128i briAB = _mm_cvtsi32_si128(m_brightnessX2); \
briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); /* DEFAULT_BRIGHTNESS in 0, 0x00 in 2. */ \
colAB = _mm_mullo_epi16(colAB, briAB); \
__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7); \
colAB = _mm_srli_epi16(colAB, 7); \
\
/* Sum overbright.
* Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
* -255 is changed in -256 so we just have to take the 8 lower bits into account.
*/ \
colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER); \
colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK); \
colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK); \
colAB_ob = _mm_and_si128(colAB_ob, colAB); \
__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, zero), zero); \
\
obAB = _mm_srli_epi16(obAB, 1); /* Reduce overbright strength. */ \
obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK); \
__m128i retAB = OVERBRIGHT_VALUE_MASK; /* ob_mask is equal to white. */ \
retAB = _mm_subs_epu16(retAB, colAB); /* (255 - rgb) */ \
retAB = _mm_mullo_epi16(retAB, obAB); /* ob*(255 - rgb) */ \
retAB = _mm_srli_epi16(retAB, 8); /* ob*(255 - rgb)/256 */ \
retAB = _mm_add_epi16(retAB, colAB); /* ob*(255 - rgb)/256 + rgb */ \
\
m_colourX2 = _mm_packus_epi16(retAB, retAB);
/** The SSSE3 32 bpp blitter (without palette animation). */
class Blitter_32bppSSSE3 : public Blitter_32bppSSE2 {

Loading…
Cancel
Save