@ -8,12 +8,12 @@
# define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
# define MULSTEP_gcc(j,h0,h1,h2,h3,h4) \
gj = g [ j ] ; \
gj = g [ j ] ; \
h0 + = f0 * gj ; \
h0 + = f0 * gj ; \
_mm256_storeu_ps ( & h [ i + j ] , h0 ) ; \
_mm256_storeu_ps ( ( float * ) & h [ i + j ] , h0 ) ; \
h1 + = f1 * gj ; \
h1 + = f1 * gj ; \
h2 + = f2 * gj ; \
h2 + = f2 * gj ; \
h3 + = f3 * gj ; \
h3 + = f3 * gj ; \
h4 + = f4 * gj ; \
h4 + = f4 * gj ; \
h0 = _mm256_loadu_ps ( & h [ i + j + 5 ] ) ; \
h0 = _mm256_loadu_ps ( ( float * ) & h [ i + j + 5 ] ) ; \
h0 + = f5 * gj ;
h0 + = f5 * gj ;
# define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
# define MULSTEP_asm(j,h0,h1,h2,h3,h4) \
@ -30,9 +30,19 @@
: " +x " ( h0 ) , " +x " ( h1 ) , " +x " ( h2 ) , " +x " ( h3 ) , " +x " ( h4 ) \
: " +x " ( h0 ) , " +x " ( h1 ) , " +x " ( h2 ) , " +x " ( h3 ) , " +x " ( h4 ) \
: " x " ( gj ) , " x " ( f0 ) , " x " ( f1 ) , " x " ( f2 ) , " x " ( f3 ) , " x " ( f4 ) , " x " ( f5 ) , " m " ( h [ i + j ] ) , " m " ( h [ i + j + 5 ] ) ) ;
: " x " ( gj ) , " x " ( f0 ) , " x " ( f1 ) , " x " ( f2 ) , " x " ( f3 ) , " x " ( f4 ) , " x " ( f5 ) , " m " ( h [ i + j ] ) , " m " ( h [ i + j + 5 ] ) ) ;
# define MULSTEP MULSTEP_ asm
# define MULSTEP MULSTEP_ gcc
# define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
# define MULSTEP_noload(j,h0,h1,h2,h3,h4) \
gj = g [ j ] ; \
h0 + = gj * f0 ; \
_mm256_storeu_ps ( ( float * ) & h [ i + j ] , h0 ) ; \
h1 + = gj * f1 ; \
h2 + = gj * f2 ; \
h3 + = gj * f3 ; \
h4 + = gj * f4 ; \
h0 = gj * f5 ;
# define MULSTEP_noload_asm(j,h0,h1,h2,h3,h4) \
gj = g [ j ] ; \
gj = g [ j ] ; \
__asm__ ( \
__asm__ ( \
" vfmadd231ps %5,%6,%0 \n \t " \
" vfmadd231ps %5,%6,%0 \n \t " \
@ -46,6 +56,16 @@
: " x " ( gj ) , " x " ( f0 ) , " x " ( f1 ) , " x " ( f2 ) , " x " ( f3 ) , " x " ( f4 ) , " x " ( f5 ) , " m " ( h [ i + j ] ) ) ;
: " x " ( gj ) , " x " ( f0 ) , " x " ( f1 ) , " x " ( f2 ) , " x " ( f3 ) , " x " ( f4 ) , " x " ( f5 ) , " m " ( h [ i + j ] ) ) ;
# define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
# define MULSTEP_fromzero(j,h0,h1,h2,h3,h4) \
gj = g [ j ] ; \
h0 = gj * f0 ; \
_mm256_storeu_ps ( ( float * ) & h [ i + j ] , h0 ) ; \
h1 = gj * f1 ; \
h2 = gj * f2 ; \
h3 = gj * f3 ; \
h4 = gj * f4 ; \
h0 = gj * f5 ;
# define MULSTEP_fromzero_asm(j,h0,h1,h2,h3,h4) \
gj = g [ j ] ; \
gj = g [ j ] ; \
__asm__ ( \
__asm__ ( \
" vmulps %5,%6,%0 \n \t " \
" vmulps %5,%6,%0 \n \t " \