OpenTTD-patches/src/blitter/32bpp_sse_func.hpp

/* $Id$ */

/*
 * This file is part of OpenTTD.
 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
 */

/** @file 32bpp_sse_base.hpp Functions related to SSE 32 bpp blitter. */

#ifndef BLITTER_32BPP_SSE_BASE_HPP
#define BLITTER_32BPP_SSE_BASE_HPP

#ifdef WITH_SSE

#include "32bpp_simple.hpp"
#if (SSE_VERSION == 2)
#include <emmintrin.h>
#elif (SSE_VERSION == 3)
#include <tmmintrin.h>
#elif (SSE_VERSION == 4)
#include <smmintrin.h>
#endif

#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.
#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.
#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.

#ifdef _MSC_VER
	#define ALIGN(n) __declspec(align(n))
#else
	#define ALIGN(n) __attribute__ ((aligned (n)))
#endif

typedef union ALIGN(16) um128i {
	__m128i m128i;
	uint8 m128i_u8[16];
	uint16 m128i_u16[8];
	uint32 m128i_u32[4];
	uint64 m128i_u64[2];
} um128i;

#define CLEAR_HIGH_BYTE_MASK        _mm_setr_epi8(-1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0)
#define ALPHA_CONTROL_MASK          _mm_setr_epi8( 6,  7,  6,  7,  6,  7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)
#define PACK_LOW_CONTROL_MASK       _mm_setr_epi8( 0,  2,  4, -1,  8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)
#define PACK_HIGH_CONTROL_MASK      _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,  0,  2,  4, -1,  8, 10, 12, -1)
#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1,  2,  1,  2,  1,  2,  0,  2,  3,  2,  3,  2,  3,  2,  0,  2)
#define BRIGHTNESS_DIV_CLEANER      _mm_setr_epi8(-1,  1, -1,  1, -1,  1, -1,  0, -1,  1, -1,  1, -1,  1, -1,  0)
#define OVERBRIGHT_PRESENCE_MASK    _mm_setr_epi8( 1,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0)
#define OVERBRIGHT_VALUE_MASK       _mm_setr_epi8(-1,  0, -1,  0, -1,  0,  0,  0, -1,  0, -1,  0, -1,  0,  0,  0)
#define OVERBRIGHT_CONTROL_MASK     _mm_setr_epi8( 0,  1,  0,  1,  0,  1,  7,  7,  2,  3,  2,  3,  2,  3,  7,  7)
#define TRANSPARENT_NOM_BASE        _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)

static inline void InsertFirstUint32(const uint32 value, __m128i &into)
{
#if (SSE_VERSION >= 4)
	into = _mm_insert_epi32(into, value, 0);
#else
	NOT_REACHED();
#endif
}

static inline void InsertSecondUint32(const uint32 value, __m128i &into)
{
#if (SSE_VERSION >= 4)
	into = _mm_insert_epi32(into, value, 1);
#else
	into = _mm_insert_epi16(into, value, 2);
	into = _mm_insert_epi16(into, value >> 16, 3);
#endif
}

static inline void LoadUint64(const uint64 value, __m128i &into)
{
#ifdef _SQ64
	into = _mm_cvtsi64_si128(value);
#else
	#if (SSE_VERSION >= 4)
		into = _mm_cvtsi32_si128(value);
		InsertSecondUint32(value >> 32, into);
	#else
		(*(um128i*) &into).m128i_u64[0] = value;
	#endif
#endif
}

static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask)
{
#if (SSE_VERSION == 2)
	from = _mm_and_si128(from, mask);    // PAND, wipe high bytes to keep low bytes when packing
	return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)
#else
	return _mm_shuffle_epi8(from, mask);
#endif
}

static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask)
{
#if (SSE_VERSION == 2)
	__m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1
	return _mm_shufflehi_epi16(alphaAB, 0x3F);         // PSHUFHW, put alpha2 in front of each rgb2
#else
	return _mm_shuffle_epi8(from, mask);
#endif
}

static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask)
{
	__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());   // PUNPCKLBW, expand each uint8 into uint16
	__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

	__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;
	alphaAB = _mm_srli_epi16(alphaAB, 15);
	alphaAB = _mm_add_epi16(alphaAB, srcAB);
	alphaAB = DistributeAlpha(alphaAB, distribution_mask);

	srcAB = _mm_sub_epi16(srcAB, dstAB);     // PSUBW,    (r - Cr)
	srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)
	srcAB = _mm_srli_epi16(srcAB, 8);        // PSRLW,  a*(r - Cr)/256
	srcAB = _mm_add_epi16(srcAB, dstAB);     // PADDW,  a*(r - Cr)/256 + Cr
	return PackUnsaturated(srcAB, pack_mask);
}

/* Darken 2 pixels.
 * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
 */
static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)
{
	__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());
	__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
	__m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);
	alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
	__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);
	dstAB = _mm_mullo_epi16(dstAB, nom);
	dstAB = _mm_srli_epi16(dstAB, 8);
	return _mm_packus_epi16(dstAB, dstAB);
}

IGNORE_UNINITIALIZED_WARNING_START
static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness)
{
	uint64 c16 = colour.b | (uint64) colour.g << 16 | (uint64) colour.r << 32;
	c16 *= brightness;
	uint64 c16_ob = c16; // Helps out of order execution.
	c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;
	c16 &= 0x01FF01FF01FF;

	/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
	c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;
	const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;

	const uint32 alpha32 = colour.data & 0xFF000000;
	__m128i ret;
	LoadUint64(c16, ret);
	if (ob != 0) {
		__m128i ob128 = _mm_cvtsi32_si128(ob);
		ob128 = _mm_shufflelo_epi16(ob128, 0xC0);
		__m128i white = OVERBRIGHT_VALUE_MASK;
		__m128i c128 = ret;
		ret = _mm_subs_epu16(white, c128); // PSUBUSW,   (255 - rgb)
		ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)
		ret = _mm_srli_epi16(ret, 8);      // PSRLW,  ob*(255 - rgb)/256
		ret = _mm_add_epi16(ret, c128);    // PADDW,  ob*(255 - rgb)/256 + rgb
	}

	ret = _mm_packus_epi16(ret, ret);      // PACKUSWB, saturate and pack.
	return alpha32 | _mm_cvtsi128_si32(ret);
}
IGNORE_UNINITIALIZED_WARNING_STOP

/** ReallyAdjustBrightness() is not called that often.
 * Inlining this function implies a far jump, which has a huge latency.
 */
static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)
{
	/* Shortcut for normal brightness. */
	if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour;

	return ReallyAdjustBrightness(colour, brightness);
}

static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness)
{
#if (SSE_VERSION < 3)
	NOT_REACHED();
#else
	/* The following dataflow differs from the one of AdjustBrightness() only for alpha.
	 * In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).
	 * OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.
	 */
	brightness &= 0xFF00FF00;
	brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;

	__m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());
	__m128i briAB = _mm_cvtsi32_si128(brightness);
	briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.
	colAB = _mm_mullo_epi16(colAB, briAB);
	__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7);
	colAB = _mm_srli_epi16(colAB, 7);

	/* Sum overbright.
	 * Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.
	 * -255 is changed in -256 so we just have to take the 8 lower bits into account.
	 */
	colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);
	colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);
	colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);
	colAB_ob = _mm_and_si128(colAB_ob, colAB);
	__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());

	obAB = _mm_srli_epi16(obAB, 1);        // Reduce overbright strength.
	obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);
	__m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.
	retAB = _mm_subs_epu16(retAB, colAB);  //    (255 - rgb)
	retAB = _mm_mullo_epi16(retAB, obAB);  // ob*(255 - rgb)
	retAB = _mm_srli_epi16(retAB, 8);      // ob*(255 - rgb)/256
	retAB = _mm_add_epi16(retAB, colAB);   // ob*(255 - rgb)/256 + rgb

	return _mm_packus_epi16(retAB, retAB);
#endif
}

#endif /* WITH_SSE */
#endif /* BLITTER_32BPP_SSE_BASE_HPP */
(svn r26257) -Codechange: replace most of the SSE macros by functions 11 years ago			`/* $Id$ */`

			`/*`
			`* This file is part of OpenTTD.`
			`* OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.`
			`* OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.`
			`* See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`/** @file 32bpp_sse_base.hpp Functions related to SSE 32 bpp blitter. */`

			`#ifndef BLITTER_32BPP_SSE_BASE_HPP`
			`#define BLITTER_32BPP_SSE_BASE_HPP`

			`#ifdef WITH_SSE`

			`#include "32bpp_simple.hpp"`
			`#if (SSE_VERSION == 2)`
			`#include <emmintrin.h>`
			`#elif (SSE_VERSION == 3)`
			`#include <tmmintrin.h>`
			`#elif (SSE_VERSION == 4)`
			`#include <smmintrin.h>`
			`#endif`

			`#define META_LENGTH 2 ///< Number of uint32 inserted before each line of pixels in a sprite.`
			`#define MARGIN_NORMAL_THRESHOLD (zoom == ZOOM_LVL_OUT_32X ? 8 : 4) ///< Minimum width to use margins with BM_NORMAL.`
			`#define MARGIN_REMAP_THRESHOLD 4 ///< Minimum width to use margins with BM_COLOUR_REMAP.`

			`#ifdef _MSC_VER`
			`#define ALIGN(n) __declspec(align(n))`
			`#else`
			`#define ALIGN(n) __attribute__ ((aligned (n)))`
			`#endif`

			`typedef union ALIGN(16) um128i {`
			`__m128i m128i;`
			`uint8 m128i_u8[16];`
			`uint16 m128i_u16[8];`
			`uint32 m128i_u32[4];`
			`uint64 m128i_u64[2];`
			`} um128i;`

			`#define CLEAR_HIGH_BYTE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0)`
			`#define ALPHA_CONTROL_MASK _mm_setr_epi8( 6, 7, 6, 7, 6, 7, -1, -1, 14, 15, 14, 15, 14, 15, -1, -1)`
			`#define PACK_LOW_CONTROL_MASK _mm_setr_epi8( 0, 2, 4, -1, 8, 10, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1)`
			`#define PACK_HIGH_CONTROL_MASK _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, -1, 8, 10, 12, -1)`
			`#define BRIGHTNESS_LOW_CONTROL_MASK _mm_setr_epi8( 1, 2, 1, 2, 1, 2, 0, 2, 3, 2, 3, 2, 3, 2, 0, 2)`
			`#define BRIGHTNESS_DIV_CLEANER _mm_setr_epi8(-1, 1, -1, 1, -1, 1, -1, 0, -1, 1, -1, 1, -1, 1, -1, 0)`
			`#define OVERBRIGHT_PRESENCE_MASK _mm_setr_epi8( 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0)`
			`#define OVERBRIGHT_VALUE_MASK _mm_setr_epi8(-1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0)`
			`#define OVERBRIGHT_CONTROL_MASK _mm_setr_epi8( 0, 1, 0, 1, 0, 1, 7, 7, 2, 3, 2, 3, 2, 3, 7, 7)`
			`#define TRANSPARENT_NOM_BASE _mm_setr_epi16(256, 256, 256, 256, 256, 256, 256, 256)`

			`static inline void InsertFirstUint32(const uint32 value, __m128i &into)`
			`{`
			`#if (SSE_VERSION >= 4)`
			`into = _mm_insert_epi32(into, value, 0);`
			`#else`
			`NOT_REACHED();`
			`#endif`
			`}`

			`static inline void InsertSecondUint32(const uint32 value, __m128i &into)`
			`{`
			`#if (SSE_VERSION >= 4)`
			`into = _mm_insert_epi32(into, value, 1);`
			`#else`
			`into = _mm_insert_epi16(into, value, 2);`
			`into = _mm_insert_epi16(into, value >> 16, 3);`
			`#endif`
			`}`

			`static inline void LoadUint64(const uint64 value, __m128i &into)`
			`{`
			`#ifdef _SQ64`
			`into = _mm_cvtsi64_si128(value);`
			`#else`
			`#if (SSE_VERSION >= 4)`
			`into = _mm_cvtsi32_si128(value);`
			`InsertSecondUint32(value >> 32, into);`
			`#else`
			`((um128i) &into).m128i_u64[0] = value;`
			`#endif`
			`#endif`
			`}`

			`static inline __m128i PackUnsaturated(__m128i from, const __m128i &mask)`
			`{`
			`#if (SSE_VERSION == 2)`
			`from = _mm_and_si128(from, mask); // PAND, wipe high bytes to keep low bytes when packing`
			`return _mm_packus_epi16(from, from); // PACKUSWB, pack 2 colours (with saturation)`
			`#else`
			`return _mm_shuffle_epi8(from, mask);`
			`#endif`
			`}`

			`static inline __m128i DistributeAlpha(const __m128i from, const __m128i &mask)`
			`{`
			`#if (SSE_VERSION == 2)`
			`__m128i alphaAB = _mm_shufflelo_epi16(from, 0x3F); // PSHUFLW, put alpha1 in front of each rgb1`
			`return _mm_shufflehi_epi16(alphaAB, 0x3F); // PSHUFHW, put alpha2 in front of each rgb2`
			`#else`
			`return _mm_shuffle_epi8(from, mask);`
			`#endif`
			`}`

			`static inline __m128i AlphaBlendTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &pack_mask)`
			`{`
			`__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128()); // PUNPCKLBW, expand each uint8 into uint16`
			`__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());`

			`__m128i alphaAB = _mm_cmpgt_epi16(srcAB, _mm_setzero_si128()); // PCMPGTW, if (alpha > 0) a++;`
			`alphaAB = _mm_srli_epi16(alphaAB, 15);`
			`alphaAB = _mm_add_epi16(alphaAB, srcAB);`
			`alphaAB = DistributeAlpha(alphaAB, distribution_mask);`

			`srcAB = _mm_sub_epi16(srcAB, dstAB); // PSUBW, (r - Cr)`
			`srcAB = _mm_mullo_epi16(srcAB, alphaAB); // PMULLW, a*(r - Cr)`
			`srcAB = _mm_srli_epi16(srcAB, 8); // PSRLW, a*(r - Cr)/256`
			`srcAB = _mm_add_epi16(srcAB, dstAB); // PADDW, a*(r - Cr)/256 + Cr`
			`return PackUnsaturated(srcAB, pack_mask);`
			`}`

			`/* Darken 2 pixels.`
			`* rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)`
			`*/`
			`static inline __m128i DarkenTwoPixels(__m128i src, __m128i dst, const __m128i &distribution_mask, const __m128i &tr_nom_base)`
			`{`
			`__m128i srcAB = _mm_unpacklo_epi8(src, _mm_setzero_si128());`
			`__m128i dstAB = _mm_unpacklo_epi8(dst, _mm_setzero_si128());`
			`__m128i alphaAB = DistributeAlpha(srcAB, distribution_mask);`
			`alphaAB = _mm_srli_epi16(alphaAB, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.`
			`__m128i nom = _mm_sub_epi16(tr_nom_base, alphaAB);`
			`dstAB = _mm_mullo_epi16(dstAB, nom);`
			`dstAB = _mm_srli_epi16(dstAB, 8);`
			`return _mm_packus_epi16(dstAB, dstAB);`
			`}`

			`IGNORE_UNINITIALIZED_WARNING_START`
			`static Colour ReallyAdjustBrightness(Colour colour, uint8 brightness)`
			`{`
			`uint64 c16 = colour.b \| (uint64) colour.g << 16 \| (uint64) colour.r << 32;`
			`c16 *= brightness;`
			`uint64 c16_ob = c16; // Helps out of order execution.`
			`c16 /= Blitter_32bppBase::DEFAULT_BRIGHTNESS;`
			`c16 &= 0x01FF01FF01FF;`

			`/* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */`
			`c16_ob = (((c16_ob >> (8 + 7)) & 0x0100010001) * 0xFF) & c16;`
			`const uint ob = ((uint16) c16_ob + (uint16) (c16_ob >> 16) + (uint16) (c16_ob >> 32)) / 2;`

			`const uint32 alpha32 = colour.data & 0xFF000000;`
			`__m128i ret;`
			`LoadUint64(c16, ret);`
			`if (ob != 0) {`
			`__m128i ob128 = _mm_cvtsi32_si128(ob);`
			`ob128 = _mm_shufflelo_epi16(ob128, 0xC0);`
			`__m128i white = OVERBRIGHT_VALUE_MASK;`
			`__m128i c128 = ret;`
			`ret = _mm_subs_epu16(white, c128); // PSUBUSW, (255 - rgb)`
			`ret = _mm_mullo_epi16(ret, ob128); // PMULLW, ob*(255 - rgb)`
			`ret = _mm_srli_epi16(ret, 8); // PSRLW, ob*(255 - rgb)/256`
			`ret = _mm_add_epi16(ret, c128); // PADDW, ob*(255 - rgb)/256 + rgb`
			`}`

			`ret = _mm_packus_epi16(ret, ret); // PACKUSWB, saturate and pack.`
			`return alpha32 \| _mm_cvtsi128_si32(ret);`
			`}`
			`IGNORE_UNINITIALIZED_WARNING_STOP`

			`/** ReallyAdjustBrightness() is not called that often.`
			`* Inlining this function implies a far jump, which has a huge latency.`
			`*/`
			`static inline Colour AdjustBrightneSSE(Colour colour, uint8 brightness)`
			`{`
			`/* Shortcut for normal brightness. */`
			`if (brightness == Blitter_32bppBase::DEFAULT_BRIGHTNESS) return colour;`

			`return ReallyAdjustBrightness(colour, brightness);`
			`}`

			`static inline __m128i AdjustBrightnessOfTwoPixels(__m128i from, uint32 brightness)`
			`{`
			`#if (SSE_VERSION < 3)`
			`NOT_REACHED();`
			`#else`
			`/* The following dataflow differs from the one of AdjustBrightness() only for alpha.`
			`* In order to keep alpha in colAB, insert a 1 in a unused brightness byte (a*1->a).`
			`* OK, not a 1 but DEFAULT_BRIGHTNESS to compensate the div.`
			`*/`
			`brightness &= 0xFF00FF00;`
			`brightness += Blitter_32bppBase::DEFAULT_BRIGHTNESS;`

			`__m128i colAB = _mm_unpacklo_epi8(from, _mm_setzero_si128());`
			`__m128i briAB = _mm_cvtsi32_si128(brightness);`
			`briAB = _mm_shuffle_epi8(briAB, BRIGHTNESS_LOW_CONTROL_MASK); // DEFAULT_BRIGHTNESS in 0, 0x00 in 2.`
			`colAB = _mm_mullo_epi16(colAB, briAB);`
			`__m128i colAB_ob = _mm_srli_epi16(colAB, 8+7);`
			`colAB = _mm_srli_epi16(colAB, 7);`

			`/* Sum overbright.`
			`* Maximum for each rgb is 508 => 9 bits. The highest bit tells if there is overbright.`
			`* -255 is changed in -256 so we just have to take the 8 lower bits into account.`
			`*/`
			`colAB = _mm_and_si128(colAB, BRIGHTNESS_DIV_CLEANER);`
			`colAB_ob = _mm_and_si128(colAB_ob, OVERBRIGHT_PRESENCE_MASK);`
			`colAB_ob = _mm_mullo_epi16(colAB_ob, OVERBRIGHT_VALUE_MASK);`
			`colAB_ob = _mm_and_si128(colAB_ob, colAB);`
			`__m128i obAB = _mm_hadd_epi16(_mm_hadd_epi16(colAB_ob, _mm_setzero_si128()), _mm_setzero_si128());`

			`obAB = _mm_srli_epi16(obAB, 1); // Reduce overbright strength.`
			`obAB = _mm_shuffle_epi8(obAB, OVERBRIGHT_CONTROL_MASK);`
			`__m128i retAB = OVERBRIGHT_VALUE_MASK; // ob_mask is equal to white.`
			`retAB = _mm_subs_epu16(retAB, colAB); // (255 - rgb)`
			`retAB = _mm_mullo_epi16(retAB, obAB); // ob*(255 - rgb)`
			`retAB = _mm_srli_epi16(retAB, 8); // ob*(255 - rgb)/256`
			`retAB = _mm_add_epi16(retAB, colAB); // ob*(255 - rgb)/256 + rgb`

			`return _mm_packus_epi16(retAB, retAB);`
			`#endif`
			`}`

			`#endif /* WITH_SSE */`
			`#endif /* BLITTER_32BPP_SSE_BASE_HPP */`