`
jgsj
  • 浏览: 1051115 次
文章分类
社区版块
存档分类
最新评论

AVX Intrinsics各函数介绍

 
阅读更多

SIMD相关头文件包括:

//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)

#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)
mmintrin.h为MMX 头文件,其中__m64的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;
    float               m64_f32[2];
    __int8              m64_i8[8];
    __int16             m64_i16[4];
    __int32             m64_i32[2];    
    __int64             m64_i64;
    unsigned __int8     m64_u8[8];
    unsigned __int16    m64_u16[4];
    unsigned __int32    m64_u32[2];
} __m64;

xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
     float               m128_f32[4];
     unsigned __int64    m128_u64[2];
     __int8              m128_i8[16];
     __int16             m128_i16[8];
     __int32             m128_i32[4];
     __int64             m128_i64[2];
     unsigned __int8     m128_u8[16];
     unsigned __int16    m128_u16[8];
     unsigned __int32    m128_u32[4];
 } __m128;

emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:

typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i {
    __int8              m128i_i8[16];
    __int16             m128i_i16[8];
    __int32             m128i_i32[4];    
    __int64             m128i_i64[2];
    unsigned __int8     m128i_u8[16];
    unsigned __int16    m128i_u16[8];
    unsigned __int32    m128i_u32[4];
    unsigned __int64    m128i_u64[2];
} __m128i;

typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d {
    double              m128d_f64[2];
} __m128d;

immintrin.h为AVX头文件,此头文件里包含AES头文件,其中__m256、__m256d、__m256i的定义为:
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256 { 
    float m256_f32[8];
} __m256;
    
typedef struct __declspec(intrin_type) _CRT_ALIGN(32) {
    double m256d_f64[4]; 
} __m256d;

typedef union  __declspec(intrin_type) _CRT_ALIGN(32) __m256i {
    __int8              m256i_i8[32];
    __int16             m256i_i16[16];
    __int32             m256i_i32[8];
    __int64             m256i_i64[4];
    unsigned __int8     m256i_u8[32];
    unsigned __int16    m256i_u16[16];
    unsigned __int32    m256i_u32[8];
    unsigned __int64    m256i_u64[4];
} __m256i;

immintrin.h文件中各函数的介绍:

/*
* Add Packed Double Precision Floating-Point Values
* **** VADDPD ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the four packed double-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10+m20, r1=m11+m21, r2=m12+m22, r3=m13+m23
extern __m256d __cdecl _mm256_add_pd(__m256d m1, __m256d m2);

/*
* Add Packed Single Precision Floating-Point Values
* **** VADDPS ymm1, ymm2, ymm3/m256
* Performs an SIMD add of the eight packed single-precision floating-point
* values from the first source operand to the second source operand, and
* stores the packed single-precision floating-point results in the
* destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10+m20, r1=m11+m21, ..., r7=m17+m27
extern __m256 __cdecl _mm256_add_ps(__m256 m1, __m256 m2);

/*
* Add/Subtract Double Precision Floating-Point Values
* **** VADDSUBPD ymm1, ymm2, ymm3/m256
* Adds odd-numbered double-precision floating-point values of the first
* source operand with the corresponding double-precision floating-point
* values from the second source operand; stores the result in the odd-numbered
* values of the destination. Subtracts the even-numbered double-precision
* floating-point values from the second source operand from the corresponding
* double-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m20, r1=m11+m21, r2=m12-m22, r3=m13-m23
extern __m256d __cdecl _mm256_addsub_pd(__m256d m1, __m256d m2);

/*
* Add/Subtract Packed Single Precision Floating-Point Values
* **** VADDSUBPS ymm1, ymm2, ymm3/m256
* Adds odd-numbered single-precision floating-point values of the first source
* operand with the corresponding single-precision floating-point values from
* the second source operand; stores the result in the odd-numbered values of
* the destination. Subtracts the even-numbered single-precision floating-point
* values from the second source operand from the corresponding
* single-precision floating values in the first source operand; stores the
* result into the even-numbered values of the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=m10-m20, r1=m11+m21, ... , r6=m16-m26, r7=m17+m27
extern __m256 __cdecl _mm256_addsub_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical AND of Packed Double Precision Floating-Point Values
* **** VANDPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=(m10 & m20), r1=(m11 & m21), r2=(m12 & m22), r3=(m13 & m23)
extern __m256d __cdecl _mm256_and_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical AND of Packed Single Precision Floating-Point Values
* **** VANDPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=(m10 & m20), r1=(m11 & m21), ..., r6=(m16 & m26), r7=(m17 & m27)
extern __m256 __cdecl _mm256_and_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical AND NOT of Packed Double Precision Floating-Point Values
* **** VANDNPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the four packed double-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=(~m10) & m20, r1=(~m11) & m21, r2=(~m12) & m22, r3=(~m13) & m23
extern __m256d __cdecl _mm256_andnot_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical AND NOT of Packed Single Precision Floating-Point Values
* **** VANDNPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical AND NOT of the eight packed single-precision
* floating-point values from the first source operand and the second source
* operand, and stores the result in the destination
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=(~m10) & m20, r1=(~m11) & m21), ..., r6=(~m16) & m26, r7=(~m17) & m27
extern __m256 __cdecl _mm256_andnot_ps(__m256 m1, __m256 m2);

/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDPD ymm1, ymm2, ymm3/m256, imm8
* Double-Precision Floating-Point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [3:0] determine whether the
* corresponding Double-Precision Floating Point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* orresponding to a word, is "1", then the Double-Precision Floating-Point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23), mask=[b3 b2 b1 b0]
//如果bn=1,则rn=m2n,如果bn=0, 则rn=m1n, 其中n为0,1,2,3
extern __m256d __cdecl _mm256_blend_pd(__m256d m1, __m256d m2, const int mask);

/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDPS ymm1, ymm2, ymm3/m256, imm8
* Single precision floating point values from the second source operand are
* conditionally merged with values from the first source operand and written
* to the destination. The immediate bits [7:0] determine whether the
* corresponding single precision floating-point value in the destination is
* copied from the second source or first source. If a bit in the mask,
* corresponding to a word, is "1", then the single-precision floating-point
* value in the second source operand is copied, else the value in the first
* source operand is copied
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27),mask=[b7 b6...b1 b0]
//如果bn=1,则rn=m2n,如果bn=0, 则rn=m1n, 其中n为0,1,2,3,4,5,6,7
extern __m256 __cdecl _mm256_blend_ps(__m256 m1, __m256 m2, const int mask);

/*
* Blend Packed Double Precision Floating-Point Values
* **** VBLENDVPD ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each quadword data element of double-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256d __cdecl _mm256_blendv_pd(__m256d m1, __m256d m2, __m256d m3);

/*
* Blend Packed Single Precision Floating-Point Values
* **** VBLENDVPS ymm1, ymm2, ymm3/m256, ymm4
* Conditionally copy each dword data element of single-precision
* floating-point value from the second source operand (third operand) and the
* first source operand (second operand) depending on mask bits defined in the
* mask register operand (fourth operand).
*/
extern __m256 __cdecl _mm256_blendv_ps(__m256 m1, __m256 m2, __m256 mask);

/*
* Divide Packed Double-Precision Floating-Point Values
* **** VDIVPD ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the four packed double-precision floating-point
* values in the first source operand by the four packed double-precision
* floating-point values in the second source operand
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10/m20, r1=m11/m21, r2=m12/m22, r3=m13/m23
extern __m256d __cdecl _mm256_div_pd(__m256d m1, __m256d m2);

/* 
* Divide Packed Single-Precision Floating-Point Values
* **** VDIVPS ymm1, ymm2, ymm3/m256
* Performs an SIMD divide of the eight packed single-precision
* floating-point values in the first source operand by the eight packed
* single-precision floating-point values in the second source operand
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//则r0=m10/m20, r1=m11/m21, ..., r6=m16/m26, r7=m17/m27
extern __m256 __cdecl _mm256_div_ps(__m256 m1, __m256 m2);

/*
* Dot Product of Packed Single-Precision Floating-Point Values
* **** VDPPS ymm1, ymm2, ymm3/m256, imm8
* Multiplies the packed single precision floating point values in the
* first source operand with the packed single-precision floats in the
* second source. Each of the four resulting single-precision values is
* conditionally summed depending on a mask extracted from the high 4 bits
* of the immediate operand. This sum is broadcast to each of 4 positions
* in the destination if the corresponding bit of the mask selected from
* the low 4 bits of the immediate operand is "1". If the corresponding
* low bit 0-3 of the mask is zero, the destination is set to zero.
* The process is replicated for the high elements of the destination.
*/
//m1=(m10, m11, m12, m13, ..., m17), m2=(m20, m21, m22, m23, ..., m27)
//mask=[b7 b6 ... b0], mask的低四位决定结果值是0,还是m1和m2相应位相乘后再求和
//若b0b1b2b3为0001,则r0=r1=r2=0,m4=m5=m6=0,此时如果b4b5b6b7为1001,
//则r3=m10*m20+m13*m23, r7=m14*m24+m17*m27,其它依次类推
extern __m256 __cdecl _mm256_dp_ps(__m256 m1, __m256 m2, const int mask);

/*
* Add Horizontal Double Precision Floating-Point Values
* **** VHADDPD ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent double-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10+m11, r1=m20+m21, r2=m12+m13, r3=m22+m23
extern __m256d __cdecl _mm256_hadd_pd(__m256d m1, __m256d m2);

/*
* Add Horizontal Single Precision Floating-Point Values
* **** VHADDPS ymm1, ymm2, ymm3/m256
* Adds pairs of adjacent single-precision floating-point values in the
* first source operand and second source operand and stores results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10+m11, r1=m12+m13, r2=m20+m21, r3=m22+m23, 
//r4=m14+m15, r5=m16+m17, r6=m24+m25, r7=m26+m27
extern __m256 __cdecl _mm256_hadd_ps(__m256 m1, __m256 m2);

/*
* Subtract Horizontal Double Precision Floating-Point Values
* **** VHSUBPD ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent double-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m11, r1=m20-m21, r2=m12-m13, r3=m22-m23
extern __m256d __cdecl _mm256_hsub_pd(__m256d m1, __m256d m2);

/*
* Subtract Horizontal Single Precision Floating-Point Values
* **** VHSUBPS ymm1, ymm2, ymm3/m256
* Subtract pairs of adjacent single-precision floating-point values in
* the first source operand and second source operand and stores results
* in the destination.
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10-m11, r1=m12-m13, r2=m20-m21, r3=m22-m23, 
//r4=m14-m15, r5=m16-m17, r6=m24-m25, r7=m26-m27
extern __m256 __cdecl _mm256_hsub_ps(__m256 m1, __m256 m2);

/*
* Maximum of Packed Double Precision Floating-Point Values
* **** VMAXPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=max(m10,m20), r1=max(m11,m21), r2=max(m12,m22), r3=max(m13,m23)
extern __m256d __cdecl _mm256_max_pd(__m256d m1, __m256d m2);

/*
* Maximum of Packed Single Precision Floating-Point Values
* **** VMAXPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the maximum value for each pair of values to the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=max(m10,m20), r1=max(m11,m21), ..., r6=max(m16,m26), r7=max(m17,m27) 
extern __m256 __cdecl _mm256_max_ps(__m256 m1, __m256 m2);

/*
* Minimum of Packed Double Precision Floating-Point Values
* **** VMINPD ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed double-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=min(m10,m20), r1=min(m11,m21), r2=min(m12,m22), r3=min(m13,m23)
extern __m256d __cdecl _mm256_min_pd(__m256d m1, __m256d m2);

/*
* Minimum of Packed Single Precision Floating-Point Values
* **** VMINPS ymm1, ymm2, ymm3/m256
* Performs an SIMD compare of the packed single-precision floating-point
* values in the first source operand and the second source operand and
* returns the minimum value for each pair of values to the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=min(m10,m20), r1=min(m11,m21), ..., r6=min(m16,m26), r7=min(m17,m27) 
extern __m256 __cdecl _mm256_min_ps(__m256 m1, __m256 m2);

/*
* Multiply Packed Double Precision Floating-Point Values
* **** VMULPD ymm1, ymm2, ymm3/m256
* Performs a SIMD multiply of the four packed double-precision floating-point
* values from the first Source operand to the Second Source operand, and
* stores the packed double-precision floating-point results in the
* destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10*m20, r1=m11*m21, r2=m12*m22, r3=m13*m23
extern __m256d __cdecl _mm256_mul_pd(__m256d m1, __m256d m2);

/*
* Multiply Packed Single Precision Floating-Point Values
* **** VMULPS ymm1, ymm2, ymm3/m256
* Performs an SIMD multiply of the eight packed single-precision
* floating-point values from the first source operand to the second source
* operand, and stores the packed double-precision floating-point results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10*m20, r1=m11*m21, ..., r6=m16*m26, r7=m17*m27 
extern __m256 __cdecl _mm256_mul_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical OR of Packed Double Precision Floating-Point Values
* **** VORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意:有时得到的结果并不是m1和m2按位或的结果?
extern __m256d __cdecl _mm256_or_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical OR of Packed Single Precision Floating-Point Values
* **** VORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical OR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意:有时得到的结果并不是m1和m2按位或的结果?
extern __m256 __cdecl _mm256_or_ps(__m256 m1, __m256 m2);

/*
* Shuffle Packed Double Precision Floating-Point Values
* **** VSHUFPD ymm1, ymm2, ymm3/m256, imm8
* Moves either of the two packed double-precision floating-point values from
* each double quadword in the first source operand into the low quadword
* of each double quadword of the destination; moves either of the two packed
* double-precision floating-point values from the second source operand into
* the high quadword of each double quadword of the destination operand.
* The selector operand determines which values are moved to the destination
*/
extern __m256d __cdecl _mm256_shuffle_pd(__m256d m1, __m256d m2, const int select);

/*
* Shuffle Packed Single Precision Floating-Point Values
* **** VSHUFPS ymm1, ymm2, ymm3/m256, imm8
* Moves two of the four packed single-precision floating-point values
* from each double qword of the first source operand into the low
* quadword of each double qword of the destination; moves two of the four
* packed single-precision floating-point values from each double qword of
* the second source operand into to the high quadword of each double qword
* of the destination. The selector operand determines which values are moved
* to the destination.
*/
extern __m256 __cdecl _mm256_shuffle_ps(__m256 m1, __m256 m2, const int select);

/*
* Subtract Packed Double Precision Floating-Point Values
* **** VSUBPD ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the four packed double-precision floating-point
* values of the second Source operand from the first Source operand, and
* stores the packed double-precision floating-point results in the destination
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r0=m10-m20, r1=m11-m21, r2=m12-m22, r3=m13-m23
extern __m256d __cdecl _mm256_sub_pd(__m256d m1, __m256d m2);

/*
* Subtract Packed Single Precision Floating-Point Values
* **** VSUBPS ymm1, ymm2, ymm3/m256
* Performs an SIMD subtract of the eight packed single-precision
* floating-point values in the second Source operand from the First Source
* operand, and stores the packed single-precision floating-point results in
* the destination
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r0=m10-m20, r1=m11-m21, ..., r6=m16-m26, r7=m17-m27 
extern __m256 __cdecl _mm256_sub_ps(__m256 m1, __m256 m2);

/*
* Bitwise Logical XOR of Packed Double Precision Floating-Point Values
* **** VXORPD ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the four packed double-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意:有时得到的结果并不是m1和m2按位异或的结果?
extern __m256d __cdecl _mm256_xor_pd(__m256d m1, __m256d m2);

/*
* Bitwise Logical XOR of Packed Single Precision Floating-Point Values
* **** VXORPS ymm1, ymm2, ymm3/m256
* Performs a bitwise logical XOR of the eight packed single-precision
* floating-point values from the first source operand and the second
* source operand, and stores the result in the destination
*/
//注意:有时得到的结果并不是m1和m2按位异或的结果?
extern __m256 __cdecl _mm256_xor_ps(__m256 m1, __m256 m2);

/*
* Compare Packed Double-Precision Floating-Point Values
* **** VCMPPD xmm1, xmm2, xmm3/m128, imm8
* **** VCMPPD ymm1, ymm2, ymm3/m256, imm8
* Performs an SIMD compare of the four packed double-precision floating-point
* values in the second source operand (third operand) and the first source
* operand (second operand) and returns the results of the comparison to the
* destination operand (first operand). The comparison predicate operand
* (immediate) specifies the type of comparison performed on each of the pairs
* of packed values.
* For 128-bit intrinsic with compare predicate values in range 0-7 compiler
* may generate SSE2 instructions if it is warranted for performance reasons.
*/
extern __m128d __cdecl _mm_cmp_pd(__m128d m1, __m128d m2, const int predicate);
extern __m256d __cdecl _mm256_cmp_pd(__m256d m1, __m256d m2, const int predicate);

/*
* Compare Packed Single-Precision Floating-Point Values
* **** VCMPPS xmm1, xmm2, xmm3/m256, imm8
* **** VCMPPS ymm1, ymm2, ymm3/m256, imm8
* Performs a SIMD compare of the packed single-precision floating-point values
* in the second source operand (third operand) and the first source operand
* (second operand) and returns the results of the comparison to the destination
* operand (first operand). The comparison predicate operand (immediate)
* specifies the type of comparison performed on each of the pairs of packed
* values.
* For 128-bit intrinsic with compare predicate values in range 0-7 compiler
* may generate SSE2 instructions if it is warranted for performance reasons.
*/
extern __m128 __cdecl _mm_cmp_ps(__m128 m1, __m128 m2, const int predicate);
extern __m256 __cdecl _mm256_cmp_ps(__m256 m1, __m256 m2, const int predicate);

/*
* Compare Scalar Double-Precision Floating-Point Values
* **** VCMPSD xmm1, xmm2, xmm3/m64, imm8
* Compares the low double-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results in of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies the
* type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128d __cdecl _mm_cmp_sd(__m128d m1, __m128d m2, const int predicate);

/*
* Compare Scalar Single-Precision Floating-Point Values
* **** VCMPSS xmm1, xmm2, xmm3/m64, imm8
* Compares the low single-precision floating-point values in the second source
* operand (third operand) and the first source operand (second operand) and
* returns the results of the comparison to the destination operand (first
* operand). The comparison predicate operand (immediate operand) specifies
* the type of comparison performed.
* For compare predicate values in range 0-7 compiler may generate SSE2
* instructions if it is warranted for performance reasons.
*/
extern __m128 __cdecl _mm_cmp_ss(__m128 m1, __m128 m2, const int predicate);

/*
* Convert Packed Doubleword Integers to
* Packed Double-Precision Floating-Point Values
* **** VCVTDQ2PD ymm1, xmm2/m128
* Converts four packed signed doubleword integers in the source operand to
* four packed double-precision floating-point values in the destination
*/
//从__int32类型转换到double类型
extern __m256d __cdecl _mm256_cvtepi32_pd(__m128i m1);

/*
* Convert Packed Doubleword Integers to
* Packed Single-Precision Floating-Point Values
* **** VCVTDQ2PS ymm1, ymm2/m256
* Converts eight packed signed doubleword integers in the source operand to
* eight packed double-precision floating-point values in the destination
*/
//从__int32类型转换到float类型
extern __m256  __cdecl _mm256_cvtepi32_ps(__m256i m1);

/*
* Convert Packed Double-Precision Floating-point values to
* Packed Single-Precision Floating-Point Values
* **** VCVTPD2PS xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed single-precision floating-point values in the
* destination
*/
//从double类型转换到float类型
extern __m128  __cdecl _mm256_cvtpd_ps(__m256d m1);

/*
* Convert Packed Single Precision Floating-Point Values to
* Packed Singed Doubleword Integer Values
* **** VCVTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination
*/
//从float类型转换到__int32类型
extern __m256i __cdecl _mm256_cvtps_epi32(__m256 m1);

/*
* Convert Packed Single Precision Floating-point values to
* Packed Double Precision Floating-Point Values
* **** VCVTPS2PD ymm1, xmm2/m128
* Converts four packed single-precision floating-point values in the source
* operand to four packed double-precision floating-point values in the
* destination
*/
//从float类型转换到double类型
extern __m256d __cdecl _mm256_cvtps_pd(__m128 m1);

/*
* Convert with Truncation Packed Double-Precision Floating-Point values to
* Packed Doubleword Integers
* **** VCVTTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
//从double类型转换到__int32类型,truncated
extern __m128i __cdecl _mm256_cvttpd_epi32(__m256d m1);

/*
* Convert Packed Double-Precision Floating-point values to
* Packed Doubleword Integers
* **** VCVTPD2DQ xmm1, ymm2/m256
* Converts four packed double-precision floating-point values in the source
* operand to four packed signed doubleword integers in the destination
*/
//从double类型转换到__int32类型
extern __m128i __cdecl _mm256_cvtpd_epi32(__m256d m1);

/*
* Convert with Truncation Packed Single Precision Floating-Point Values to
* Packed Singed Doubleword Integer Values
* **** VCVTTPS2DQ ymm1, ymm2/m256
* Converts eight packed single-precision floating-point values in the source
* operand to eight signed doubleword integers in the destination.
* When a conversion is inexact, a truncated (round toward zero) value is
* returned. If a converted result is larger than the maximum signed doubleword
* integer, the floating-point invalid exception is raised, and if this
* exception is masked, the indefinite integer value (80000000H) is returned
*/
//从float类型转换到__int32类型,truncated
extern __m256i __cdecl _mm256_cvttps_epi32(__m256 m1);

/*
* Extract packed floating-point values
* **** VEXTRACTF128 xmm1/m128, ymm2, imm8
* Extracts 128-bits of packed floating-point values from the source operand
* at an 128-bit offset from imm8[0] into the destination
*/
//offset:a constant integer value that represents the 128-bit offset from 
//where extraction must start
//从256位中提取128位,offset决定提取的起始位置
extern __m128  __cdecl _mm256_extractf128_ps(__m256 m1, const int offset);
extern __m128d __cdecl _mm256_extractf128_pd(__m256d m1, const int offset);
extern __m128i __cdecl _mm256_extractf128_si256(__m256i m1, const int offset);

/*
* Zero All YMM registers
* **** VZEROALL
* Zeros contents of all YMM registers
*/
extern void __cdecl _mm256_zeroall(void);

/*
* Zero Upper bits of YMM registers
* **** VZEROUPPER
* Zeros the upper 128 bits of all YMM registers. The lower 128-bits of the
* registers (the corresponding XMM registers) are unmodified
*/
extern void __cdecl _mm256_zeroupper(void);

/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2, ymm3/m256
* **** VPERMILPS xmm1, xmm2, xmm3/m128
* Permute Single-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of corresponding elements the
* shuffle control and store results in the destination
*/
//control:a vector with 2-bit control fields, one for each corresponding element 
//of the source vector, for the 256-bit m1 source vector this control vector
//contains eight 2-bit control fields,for the 128-bit m1 source vector this 
//control vector contains four 2-bit control fields
extern __m256  __cdecl _mm256_permutevar_ps(__m256 m1, __m256i control);
extern __m128  __cdecl _mm_permutevar_ps(__m128 a, __m128i control);

/*
* Permute Single-Precision Floating-Point Values
* **** VPERMILPS ymm1, ymm2/m256, imm8
* **** VPERMILPS xmm1, xmm2/m128, imm8
* Permute Single-Precision Floating-Point values in the first source operand
* using four 2-bit control fields in the 8-bit immediate and store results
* in the destination
*/
//control:an integer specified as an 8-bit immediate;for the 256-bit m1 vector
//this integer contains four 2-bit control fields in the low 8 bits of 
//the immediate, for the 128-bit m1 vector this integer contains two 2-bit
//control fields in the low 4 bits of the immediate
extern __m256  __cdecl _mm256_permute_ps(__m256 m1, int control);
extern __m128  __cdecl _mm_permute_ps(__m128 a, int control);

/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2, ymm3/m256
* **** VPERMILPD xmm1, xmm2, xmm3/m128
* Permute Double-Precision Floating-Point values in the first source operand
* using 8-bit control fields in the low bytes of the second source operand
* and store results in the destination
*/
//control:a vector with 1-bit control fields, one for each corresponding element
//of the source vector, for the 256-bit m1 source vector this control vector 
//contains four 1-bit control fields in the low 4 bits of the immediate, for the 
//128-bit m1 source vector this control vector contains two 1-bit control fields
//in the low 2 bits of the immediate
extern __m256d __cdecl _mm256_permutevar_pd(__m256d m1, __m256i control);
extern __m128d __cdecl _mm_permutevar_pd(__m128d a, __m128i control);

/*
* Permute Double-Precision Floating-Point Values
* **** VPERMILPD ymm1, ymm2/m256, imm8
* **** VPERMILPD xmm1, xmm2/m128, imm8
* Permute Double-Precision Floating-Point values in the first source operand
* using two, 1-bit control fields in the low 2 bits of the 8-bit immediate
* and store results in the destination
*/
//control:an integer specified as an 8-bit immediate; for the 256-bit m1 vector
//this integer contains four 1-bit control fields in the low 4 bits of the 
//immediate, for the 128-bit m1 vector this integer contains two 1-bit
//control fields in the low 2 bits of the immediate
extern __m256d __cdecl _mm256_permute_pd(__m256d m1, int control);
extern __m128d __cdecl _mm_permute_pd(__m128d a, int control);

/*
* Permute Floating-Point Values
* **** VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
* Permute 128 bit floating-point-containing fields from the first source
* operand and second source operand using bits in the 8-bit immediate and
* store results in the destination
*/
//control:an immediate byte that specifies two 2-bit control fields and two 
//additional bits which specify zeroing behavior
extern __m256  __cdecl _mm256_permute2f128_ps(__m256 m1, __m256 m2, int control);
extern __m256d __cdecl _mm256_permute2f128_pd(__m256d m1, __m256d m2, int control);
extern __m256i __cdecl _mm256_permute2f128_si256(__m256i m1, __m256i m2, int control);

/*
* Load with Broadcast
* **** VBROADCASTSS ymm1, m32
* **** VBROADCASTSS xmm1, m32
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//*a:pointer to a memory location that can hold constant 256-bit or
//128-bit float32 values, 则r0=r1=...=rn=a[0]
extern __m256  __cdecl _mm256_broadcast_ss(float const *a);
extern __m128  __cdecl _mm_broadcast_ss(float const *a);

/*
* Load with Broadcast
* **** VBROADCASTSD ymm1, m64
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//则r0=r1=r2=r3=a[0]
extern __m256d __cdecl _mm256_broadcast_sd(double const *a);

/*
* Load with Broadcast
* **** VBROADCASTF128 ymm1, m128
* Load floating point values from the source operand and broadcast to all
* elements of the destination
*/
//若*a为a[0],a[1],则r0=r2=a[0], r1=r3=a[1]
extern __m256  __cdecl _mm256_broadcast_ps(__m128 const *a);
extern __m256d __cdecl _mm256_broadcast_pd(__m128d const *a);

/*
* Insert packed floating-point values
* **** VINSERTF128 ymm1, ymm2, xmm3/m128, imm8
* Performs an insertion of 128-bits of packed floating-point values from the
* second source operand into an the destination at an 128-bit offset from
* imm8[0]. The remaining portions of the destination are written by the
* corresponding fields of the first source operand
*/
//offset:an integer value that represents the 128-bit offset
//where the insertion must start
//The remaining portions of the destination are written by the corresponding
//elements of the first source vector, a
extern __m256  __cdecl _mm256_insertf128_ps(__m256 a, __m128 b, int offset);
extern __m256d __cdecl _mm256_insertf128_pd(__m256d a, __m128d b, int offset);
extern __m256i __cdecl _mm256_insertf128_si256(__m256i a, __m128i b, int offset);

/*
* Move Aligned Packed Double-Precision Floating-Point Values
* **** VMOVAPD ymm1, m256
* **** VMOVAPD m256, ymm1
* Moves 4 double-precision floating-point values from the source operand to
* the destination
*/
//*a:the address must be 32-byte aligned
extern __m256d __cdecl _mm256_load_pd(double const *a);
extern void    __cdecl _mm256_store_pd(double *a, __m256d b);

/*
* Move Aligned Packed Single-Precision Floating-Point Values
* **** VMOVAPS ymm1, m256
* **** VMOVAPS m256, ymm1
* Moves 8 single-precision floating-point values from the source operand to
* the destination
*/
//*a:the address must be 32-byte aligned
extern __m256  __cdecl _mm256_load_ps(float const *a);
extern void    __cdecl _mm256_store_ps(float *a, __m256 b);

/*
* Move Unaligned Packed Double-Precision Floating-Point Values
* **** VMOVUPD ymm1, m256
* **** VMOVUPD m256, ymm1
* Moves 256 bits of packed double-precision floating-point values from the
* source operand to the destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256d __cdecl _mm256_loadu_pd(double const *a);
extern void    __cdecl _mm256_storeu_pd(double *a, __m256d b);

/*
* Move Unaligned Packed Single-Precision Floating-Point Values
* **** VMOVUPS ymm1, m256
* **** VMOVUPS m256, ymm1
* Moves 256 bits of packed single-precision floating-point values from the
* source operand to the destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256  __cdecl _mm256_loadu_ps(float const *a);
extern void    __cdecl _mm256_storeu_ps(float *a, __m256 b);

/*
* Move Aligned Packed Integer Values
* **** VMOVDQA ymm1, m256
* **** VMOVDQA m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256i __cdecl _mm256_load_si256(__m256i const *a);
extern void    __cdecl _mm256_store_si256(__m256i *a, __m256i b);

/*
* Move Unaligned Packed Integer Values
* **** VMOVDQU ymm1, m256
* **** VMOVDQU m256, ymm1
* Moves 256 bits of packed integer values from the source operand to the
* destination
*/
//The address a does not need to be 32-byte aligned  
extern __m256i __cdecl _mm256_loadu_si256(__m256i const *a);
extern void    __cdecl _mm256_storeu_si256(__m256i *a, __m256i b); 

/*
* Conditional SIMD Packed Loads and Stores
* **** VMASKMOVPD xmm1, xmm2, m128
* **** VMASKMOVPD ymm1, ymm2, m256
* **** VMASKMOVPD m128, xmm1, xmm2
* **** VMASKMOVPD m256, ymm1, ymm2
*
* Load forms:
* Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location (third operand) into the destination XMM or YMM register
* (first operand) using a mask in the first source operand (second operand).
*
* Store forms:
* Stores packed values from the XMM or YMM register in the second source
* operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location using a mask in first source operand (second operand).
* Stores are atomic.
*/
//The mask is calculated from the most significant bit of each qword of the mask
//register. If any of the bits of the mask is set to zero, the corresponding value
//from the memory location is not loaded, and the corresponding field of the
//destination vector is set to zero.
extern __m256d __cdecl _mm256_maskload_pd(double const *a, __m256i mask);
extern void    __cdecl _mm256_maskstore_pd(double *a, __m256i mask, __m256d b);
extern __m128d __cdecl _mm_maskload_pd(double const *a, __m128i mask);
extern void    __cdecl _mm_maskstore_pd(double *a, __m128i mask, __m128d b); 

/*
* Conditional SIMD Packed Loads and Stores
* **** VMASKMOVPS xmm1, xmm2, m128
* **** VMASKMOVPS ymm1, ymm2, m256
* **** VMASKMOVPS m128, xmm1, xmm2
* **** VMASKMOVPS m256, ymm1, ymm2
*
* Load forms:
* Load packed values from the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location (third operand) into the destination XMM or YMM register
* (first operand) using a mask in the first source operand (second operand).
*
* Store forms:
* Stores packed values from the XMM or YMM register in the second source
* operand (third operand) into the 128-bit (XMM forms) or 256-bit (YMM forms)
* memory location using a mask in first source operand (second operand).
* Stores are atomic.
*/
//The mask is calculated from the most significant bit of each dword of the mask 
//register. If any of the bits of the mask is set to zero, the corresponding 
//value from the memory location is not loaded, and the corresponding field of
//the destination vector is set to zero.
extern __m256  __cdecl _mm256_maskload_ps(float const *a, __m256i mask);
extern void    __cdecl _mm256_maskstore_ps(float *a, __m256i mask, __m256 b); 
extern __m128  __cdecl _mm_maskload_ps(float const *a, __m128i mask);
extern void    __cdecl _mm_maskstore_ps(float *a, __m128i mask, __m128 b); 

/*
* Replicate Single-Precision Floating-Point Values
* **** VMOVSHDUP ymm1, ymm2/m256
* Duplicates odd-indexed single-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a1, a1, a3, a3, a5, a5, a7, a7)
extern __m256  __cdecl _mm256_movehdup_ps(__m256 a);

/*
* Replicate Single-Precision Floating-Point Values
* **** VMOVSLDUP ymm1, ymm2/m256
* Duplicates even-indexed single-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3, a4, a5, a6, a7);则r=(a0, a0, a2, a2, a4, a4, a6, a6)
extern __m256  __cdecl _mm256_moveldup_ps(__m256 a);

/*
* Replicate Double-Precision Floating-Point Values
* **** VMOVDDUP ymm1, ymm2/m256
* Duplicates even-indexed double-precision floating-point values from the
* source operand
*/
//a=(a0, a1, a2, a3), 则r=(a0, a0, a2, a2)
extern __m256d __cdecl _mm256_movedup_pd(__m256d a);

/*
* Move Unaligned Integer
* **** VLDDQU ymm1, m256
* The instruction is functionally similar to VMOVDQU YMM, m256 for loading
* from memory. That is: 32 bytes of data starting at an address specified by
* the source memory operand are fetched from memory and placed in a
* destination
*/
//*a:points to a memory location from where unaligned integer value must be moved
extern __m256i __cdecl _mm256_lddqu_si256(__m256i const *a);

/*
* Store Packed Integers Using Non-Temporal Hint
* **** VMOVNTDQ m256, ymm1
* Moves the packed integers in the source operand to the destination using a
* non-temporal hint to prevent caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_si256(__m256i *p, __m256i a);

/*
* Store Packed Double-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPD m256, ymm1
* Moves the packed double-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_pd(double *p, __m256d a);

/*
* Store Packed Single-Precision Floating-Point Values Using Non-Temporal Hint
* **** VMOVNTPS m256, ymm1
* Moves the packed single-precision floating-point values in the source
* operand to the destination operand using a non-temporal hint to prevent
* caching of the data during the write to memory
*/
//the address must be 32-byte aligned
extern void    __cdecl _mm256_stream_ps(float *p, __m256 a);

/*
* Compute Approximate Reciprocals of Packed Single-Precision Floating-Point Values
* **** VRCPPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the eight
* packed single precision floating-point values in the source operand and
* stores the packed single-precision floating-point results in the destination
*/
//a=(a0, a1, a2, ..., a6, a7);
//则r=(1/a0, 1/a1, ..., 1/a6, 1/a7), 求倒数
extern __m256  __cdecl _mm256_rcp_ps(__m256 a);

/*
* Compute Approximate Reciprocals of Square Roots of
* Packed Single-Precision Floating-point Values
* **** VRSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the approximate reciprocals of the square
* roots of the eight packed single precision floating-point values in the
* source operand and stores the packed single-precision floating-point results
* in the destination
*/
//a=(a0, a1, a2, ..., a6, a7);
//则r=(1/sqrt(a0), 1/sqrt(a1), ..., 1/sqrt(a6), 1/sqrt(a7)), 先开方再求倒数
extern __m256  __cdecl _mm256_rsqrt_ps(__m256 a);

/*
* Square Root of Double-Precision Floating-Point Values
* **** VSQRTPD ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the two or four packed
* double-precision floating-point values in the source operand and stores
* the packed double-precision floating-point results in the destination
*/
//a=(a0, a1, a2, a3, a4);则r=(sqrt(a0),sqrt(a1), sqrt(a2), sqrt(a3)), 求开方
extern __m256d __cdecl _mm256_sqrt_pd(__m256d a);

/*
* Square Root of Single-Precision Floating-Point Values
* **** VSQRTPS ymm1, ymm2/m256
* Performs an SIMD computation of the square roots of the eight packed
* single-precision floating-point values in the source operand stores the
* packed double-precision floating-point results in the destination
*/
//a=(a0, a1, a2, ..., a3, a4);则r=(sqrt(a0),sqrt(a1), ..., sqrt(a2), sqrt(a3)), 求开方
extern __m256  __cdecl _mm256_sqrt_ps(__m256 a);

/*
* Round Packed Double-Precision Floating-Point Values
* **** VROUNDPD ymm1,ymm2/m256,imm8
* Round the four Double-Precision Floating-Point Values values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a double-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
//a=(22.8, -11.3, -33.8, 4.3),
//若iRoundMode=0x0A, 则r=(23, -11, -33, 5)
//若iRoundMode=0x09, 则r=(22, -12, -34, 4)
extern __m256d __cdecl _mm256_round_pd(__m256d a, int iRoundMode);
#define _mm256_ceil_pd(val)   _mm256_round_pd((val), 0x0A);
#define _mm256_floor_pd(val)  _mm256_round_pd((val), 0x09);

/*
* Round Packed Single-Precision Floating-Point Values
* **** VROUNDPS ymm1,ymm2/m256,imm8
* Round the four single-precision floating-point values values in the source
* operand by the rounding mode specified in the immediate operand and place
* the result in the destination. The rounding process rounds the input to an
* integral value and returns the result as a double-precision floating-point
* value. The Precision Floating Point Exception is signaled according to the
* immediate operand. If any source operand is an SNaN then it will be
* converted to a QNaN.
*/
//用法与_mm256_round_pd相同
extern __m256  __cdecl _mm256_round_ps(__m256 a, int iRoundMode);
#define _mm256_ceil_ps(val)   _mm256_round_ps((val), 0x0A);
#define _mm256_floor_ps(val)  _mm256_round_ps((val), 0x09);

/*
* Unpack and Interleave High Packed Double-Precision Floating-Point Values
* **** VUNPCKHPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the high double-precision floating-point
* values from the first source operand and the second source operand.
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r=(m11, m21, m13, m23)
extern __m256d __cdecl _mm256_unpackhi_pd(__m256d m1, __m256d m2);

/*
* Unpack and Interleave High Packed Single-Precision Floating-Point Values
* **** VUNPCKHPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the high single-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r=(m12, m22, m13, m23, m16, m26, m17, m27)
extern __m256  __cdecl _mm256_unpackhi_ps(__m256 m1, __m256 m2); 

/*
* Unpack and Interleave Low Packed Double-Precision Floating-Point Values
* **** VUNPCKLPD ymm1,ymm2,ymm3/m256
* Performs an interleaved unpack of the low double-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, m12, m13), m2=(m20, m21, m22, m23)
//则r=(m10, m20, m12, m22)
extern __m256d __cdecl _mm256_unpacklo_pd(__m256d m1, __m256d m2);

/*
* Unpack and Interleave Low Packed Single-Precision Floating-Point Values
* **** VUNPCKLPS ymm1,ymm2,ymm3
* Performs an interleaved unpack of the low single-precision floating-point
* values from the first source operand and the second source operand
*/
//m1=(m10, m11, ..., m17), m2=(m20, m21, ..., m27)
//则r=(m10, m20, m11, m21, m14, m24, m15, m25)
extern __m256  __cdecl _mm256_unpacklo_ps(__m256 m1, __m256 m2);

/*
* Packed Bit Test
* **** VPTEST ymm1, ymm2/m256
* VPTEST set the ZF flag if all bits in the result are 0 of the bitwise AND
* of the first source operand and the second source operand. VPTEST sets the
* CF flag if all bits in the result are 0 of the bitwise AND of the second
* source operand and the logical NOT of the destination.
*/
extern int     __cdecl _mm256_testz_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testc_si256(__m256i s1, __m256i s2);
extern int     __cdecl _mm256_testnzc_si256(__m256i s1, __m256i s2);

/*
* Packed Bit Test
* **** VTESTPD ymm1, ymm2/m256
* **** VTESTPD xmm1, xmm2/m128
* VTESTPD performs a bitwise comparison of all the sign bits of the
* double-precision elements in the first source operation and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int     __cdecl _mm256_testz_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm256_testnzc_pd(__m256d s1, __m256d s2);
extern int     __cdecl _mm_testz_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testc_pd(__m128d s1, __m128d s2);
extern int     __cdecl _mm_testnzc_pd(__m128d s1, __m128d s2);

/*
* Packed Bit Test
* **** VTESTPS ymm1, ymm2/m256
* **** VTESTPS xmm1, xmm2/m128
* VTESTPS performs a bitwise comparison of all the sign bits of the packed
* single-precision elements in the first source operation and corresponding
* sign bits in the second source operand. If the AND of the two sets of bits
* produces all zeros, the ZF is set else the ZF is clear. If the AND NOT of
* the source sign bits with the dest sign bits produces all zeros the CF is
* set else the CF is clear
*/
extern int     __cdecl _mm256_testz_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm256_testnzc_ps(__m256 s1, __m256 s2);
extern int     __cdecl _mm_testz_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testc_ps(__m128 s1, __m128 s2);
extern int     __cdecl _mm_testnzc_ps(__m128 s1, __m128 s2);

/*
* Extract Double-Precision Floating-Point Sign mask
* **** VMOVMSKPD r32, ymm2
* Extracts the sign bits from the packed double-precision floating-point
* values in the source operand, formats them into a 4-bit mask, and stores
* the mask in the destination
*/
extern int     __cdecl _mm256_movemask_pd(__m256d a);

/*
* Extract Single-Precision Floating-Point Sign mask
* **** VMOVMSKPS r32, ymm2
* Extracts the sign bits from the packed single-precision floating-point
* values in the source operand, formats them into a 8-bit mask, and stores
* the mask in the destination
*/
extern int     __cdecl _mm256_movemask_ps(__m256 a);

/*
* Return 256-bit vector with all elements set to 0
*/
//则r0=r1=...=rn=0
extern __m256d __cdecl _mm256_setzero_pd(void);
extern __m256  __cdecl _mm256_setzero_ps(void);
extern __m256i __cdecl _mm256_setzero_si256(void);

/*
* Return 256-bit vector intialized to specified arguments
*/
//则r = (d, c, b, a)
extern __m256d __cdecl _mm256_set_pd(double a, double b, double c, double d);
extern __m256  __cdecl _mm256_set_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_set_epi8(char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_set_epi16(short, short, short, short, short, short, short, short,
	short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_set_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_set_epi64x(long long, long long, long long, long long);

//则r = (a, b, c, d)
extern __m256d __cdecl _mm256_setr_pd(double a, double b, double c, double d);
extern __m256  __cdecl _mm256_setr_ps(float, float, float, float, float, float, float, float);
extern __m256i __cdecl _mm256_setr_epi8(char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char,
	char, char, char, char, char, char, char, char);
extern __m256i __cdecl _mm256_setr_epi16(short, short, short, short, short, short, short, short,
	short, short, short, short, short, short, short, short);
extern __m256i __cdecl _mm256_setr_epi32(int, int, int, int, int, int, int, int);
extern __m256i __cdecl _mm256_setr_epi64x(long long, long long, long long, long long);

/*
* Return 256-bit vector with all elements intialized to specified scalar
*/
//则r0 =  ... = rn = a
extern __m256d __cdecl _mm256_set1_pd(double a);
extern __m256  __cdecl _mm256_set1_ps(float);
extern __m256i __cdecl _mm256_set1_epi8(char);
extern __m256i __cdecl _mm256_set1_epi16(short);
extern __m256i __cdecl _mm256_set1_epi32(int);
extern __m256i __cdecl _mm256_set1_epi64x(long long);

/*
* Support intrinsics to do vector type casts. These intrinsics do not introduce
* extra moves to generated code. When cast is done from a 128 to 256-bit type
* the low 128 bits of the 256-bit result contain source parameter value; the
* upper 128 bits of the result are undefined
*/
//类型转换
extern __m256  __cdecl _mm256_castpd_ps(__m256d a);
extern __m256d __cdecl _mm256_castps_pd(__m256 a);
extern __m256i __cdecl _mm256_castps_si256(__m256 a);
extern __m256i __cdecl _mm256_castpd_si256(__m256d a);
extern __m256  __cdecl _mm256_castsi256_ps(__m256i a);
extern __m256d __cdecl _mm256_castsi256_pd(__m256i a);
extern __m128  __cdecl _mm256_castps256_ps128(__m256 a);
extern __m128d __cdecl _mm256_castpd256_pd128(__m256d a);
extern __m128i __cdecl _mm256_castsi256_si128(__m256i a);
extern __m256  __cdecl _mm256_castps128_ps256(__m128 a);
extern __m256d __cdecl _mm256_castpd128_pd256(__m128d a);
extern __m256i __cdecl _mm256_castsi128_si256(__m128i a);

/* Start of new intrinsics for Dev10 SP1
*
* The list of extended control registers.
* Currently, the list includes only one register.
*/
#define _XCR_XFEATURE_ENABLED_MASK 0

/* Returns the content of the specified extended control register */
extern unsigned __int64 __cdecl _xgetbv(unsigned int ext_ctrl_reg);

/* Writes the value to the specified extended control register */
extern void __cdecl _xsetbv(unsigned int ext_ctrl_reg, unsigned __int64 val);


/* 
* Performs a full or partial save of the enabled processor state components
* using the the specified memory address location and a mask.
*/
extern void __cdecl _xsave(void *mem, unsigned __int64 save_mask);
extern void __cdecl _xsave64(void *mem, unsigned __int64 save_mask);

/* 
* Performs a full or partial save of the enabled processor state components
* using the the specified memory address location and a mask.
* Optimize the state save operation if possible.
*/
extern void __cdecl _xsaveopt(void *mem, unsigned __int64 save_mask);
extern void __cdecl _xsaveopt64(void *mem, unsigned __int64 save_mask);

/* 
* Performs a full or partial restore of the enabled processor states
* using the state information stored in the specified memory address location
* and a mask.
*/
extern void __cdecl _xrstor(void *mem, unsigned __int64 restore_mask);
extern void __cdecl _xrstor64(void *mem, unsigned __int64 restore_mask);

/* 
* Saves the current state of the x87 FPU, MMX technology, XMM,
* and MXCSR registers to the specified 512-byte memory location.
*/
extern void __cdecl _fxsave(void *mem);
extern void __cdecl _fxsave64(void *mem);

/* 
* Restore the current state of the x87 FPU, MMX technology, XMM,
* and MXCSR registers from the specified 512-byte memory location.
*/
extern void __cdecl _fxrstor(void *mem);
extern void __cdecl _fxrstor64(void *mem);

分享到:
评论

相关推荐

    Intel C++ Compiler Intrinsics Reference

    这份文档是Intel官方提供的关于其C++编译器内部函数(Intrinsics)的参考指南,旨在帮助开发者更高效地利用Intel处理器的特性,从而提升代码执行效率。 ### Intel C++ Compiler Intrinsics简介 Intel C++ Compiler...

    intel_c++_intrinsics

    开发人员可以通过在C++代码中嵌入特定的Intrinsics函数来调用这些底层指令。例如,使用`__m256`类型和`_mm256_add_ps`函数可以执行256位的浮点加法操作。 ### 注意事项 虽然Intel C++ Intrinsics提供了强大的性能...

    x86 intrinsics cheat sheet v1.0.pdf

    这份文件名为“x86 intrinsics cheat sheet v1.0.pdf”,是一份关于x86指令集的内建函数速查表,主要面向使用Intel处理器的开发者。速查表中涵盖了Intel处理器支持的大部分x86内建函数,但不包括一些过时或已废弃的...

    康威生命游戏C语言并行实现(intrinsics)

    在C语言中,使用intrinsics可以充分利用CPU的向量运算单元(如SSE、AVX),提高计算密集型任务的性能。在康威生命游戏中,大量细胞状态的更新工作正适合这样的优化。 在康威生命游戏的SIMD(Single Instruction ...

    intel SIMD指令参考手册

    `Intel Intrinsics Guide`是Intel提供的一个工具,它详列了所有这些SIMD扩展的内联函数(Intrinsics),这些函数可以直接在C/C++代码中使用,为开发者提供了一种高效利用SIMD指令的方法。通过使用Intrinsics,开发者...

    xsimd:用于SIMD内在函数和并行的,优化的数学函数(SSE,AVX,NEON,AVX512)的C ++包装器

    **xsimd:SIMD内建函数与并行优化数学库** `xsimd`是一个开源的C++库,专为向量处理和并行计算而设计。它提供了对SIMD(单指令多数据)技术的封装,使开发人员能够利用现代处理器的硬件加速功能,如SSE、AVX、NEON...

    C 代码 阐释了 C 语言提供的内部函数的使用.rar

    在C语言中,内部函数(也称为内联函数或内建函数)是编译器提供的一种特殊功能,它们被设计用来提高程序的执行效率。这些函数通常与标准库中的函数类似,但它们的调用过程不会产生函数调用开销,因为编译器会尝试在...

    C 代码 演示可用的内部函数 由 Gnu gcc 编译器为 C 编写.rar

    2. **向量运算内建函数**:GCC支持SSE、AVX等向量指令集,提供如__builtin_ia32_vec_init_v4si()这样的函数来创建向量,以及__builtin_ia32_vec_ext_v4si()进行元素提取等。 3. **原子操作内建函数**:如__sync_...

    X86优化之Intrinsic汇编基础.docx

    本文档旨在介绍X86 Intrinsic的基础知识及其应用场景,帮助读者更好地理解并掌握Intrinsic在实际开发中的应用。 #### 二、Intrinsic优化概述 在X86架构下,程序员通常可以通过多种方式进行优化,主要包括纯汇编...

    simd.tar.gz

    Intrinsics是C++和C语言的内联函数,它们映射到硬件指令,允许程序员直接利用SIMD和AVX的功能,而无需直接操作底层硬件。通过使用Intrinsics,开发者可以编写出高效、可移植的代码,同时充分利用现代处理器的并行...

    60fcf3ccd9a2ebc9e432ffb5cb91638b-ippicv_2017u2_win_ia32_20170418

    IPPIW 是 Intel Performance Primitives Intrinsics Library,包含了 Intel 处理器的向量指令集(如 SSE、AVX)的内联函数,便于开发者编写高效代码。而 IPPIV 是 IPPICV 的核心部分,提供了大量的图像处理函数。 ...

    B-tree-build-search-parallel-avx

    3. **性能优化**:在C++中,可以使用intrinsics函数来直接调用AVX指令,实现高效的并行计算。 综上所述,"B-tree-build-search-parallel-avx"项目涉及了B树的构建、并行搜索算法以及利用AVX指令集进行性能优化的C++...

    IntrinsicsPlayground:我的玩具可以在纯C#(.NET Core 2.1)中与SSEAVX一起玩

    内部游乐场我的玩具在纯C#中使用内在函数(请参见System.Runtime.Intrinsics.Experimental ) 环境: BenchmarkDotNet =v0.10.14, OS =Windows 10.0.17134Intel Core i7-8700K CPU 3.70GHz (Coffee Lake), 1 CPU, ...

    22春南开大学《并行程序设计》在线作业二满分答案2.docx

    pthread_barrier_init() 函数用于初始化障碍,需要提供的参数包括障碍对象、障碍初值、障碍属性和参与的线程数。Barrier 是一种同步机制,用于等待所有线程达到某个点后继续执行。 2. SSE intrinsics SSE ...

    20春南开《并行程序设计》在线作业答案.docx

    28. **AVX矩阵乘法**:使用AVX进行单精度浮点数矩阵乘法,通常进行2路或4路循环展开,因为AVX可以同时处理2或4个单精度浮点数。 这些知识点涵盖了并行程序设计的多个重要概念,包括线程管理、向量化优化、并行计算...

    南开大学2021年9月《并行程序设计》作业考核试题及答案参考7.pdf

    15. AVX 寄存器:每个 AVX 寄存器宽度为 256 位。 16. SIMD 并行模式:SIMD 并行模式是将多个相同运算打包并发执行。 17. 现代 CPU:现代 CPU 中都具有指令乱序执行功能,其目的包括降低访存延迟、隐藏了访存延迟...

    南开大学21春学期(1709-2103)《并行程序设计》在线作业-答案1.pdf

    25. **AVX加速比**:对于单精度浮点数矩阵乘法的AVX并行化,期望加速比通常小于8,因为除了计算速度提升外,还受到其他因素如数据搬运、同步开销的影响。 26. **OpenMP线程数指定**:在OpenMP中,可以通过`omp_set_...

    21春南开大学《并行程序设计》在线作业参考答案.docx

    17. **线程函数参数**:在pthread_create中设置一个参数为“线程函数参数”,是因为线程函数不是由用户程序直接调用,系统需要通过这种方式传递参数。 18. **SSE寄存器数量**:SSE有8个专用寄存器。 19. **操作...

    南开大学20秋《并行程序设计》在线作业-1(参考答案).pdf

    22. 创建线程时,我们通过 pthread_create 的“线程函数参数”参数将线程号分别传递给每个线程。 23. 利用 cache line 一次读取多个数据字的机制优化程序访存性能,其机理是利用了 cache 空间局部性。 24. R*=1.3;...

    simde:SIMD指令集的实现适用于本机不支持它们的系统

    SIMDe使(例如)SSE / AVX和NEON代码在同一实现中并排存在。 其次,SIMDe使您更容易编写针对您无法方便访问的扩展的代码。 您可以在没有模拟器的情况下在x86机器上运行NEON代码。 显然,您最终将要在要定位的实际...

Global site tag (gtag.js) - Google Analytics