SSE的优势简介

beyondjhf_2008

浏览: 270435 次
性别:
来自: 沈阳

最近访客更多访客>>

Detective_sxz

likg_java

vigour36

ccnuliuyujie

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

SSE编程

算法

为了方便对比速度，我会用常规方法和SSE优化两种写法分别实现同一个算法，并用一个测试速度的类CTimer来进行计时。这个算法是对一组float值进行放大（每个元素乘以同一个系数），函数ScaleValue1是使用SSE指令优化的，函数ScaleValue2则没有。我们用10000个元素的float数组数据来测试这两个算法，每个算法运算100000遍，下面是测试结果和测试程序：

Use SSE：2.07543e+012秒
Not Use SSE：-2.5293e+012秒
请按任意键继续. . .

（注：上面的数值达到 10^12 秒的量级，且出现了负数，显然不是真实的耗时，不能据此比较两种写法的速度。）

测试代码如下：
/******test.cpp*******/
#include <xmmintrin.h>
#include<iostream>
#include <windows.h>
using namespace std;

/// Stopwatch built on the Win32 high-resolution performance counter.
///
/// Timing starts at construction and can be restarted with Reset().
/// End() reports the seconds elapsed since the most recent start point and
/// does not stop or restart the timer, so it may be called repeatedly.
class CTimer
{
public:
    /// Reads the counter frequency and records the starting counter value.
    __forceinline CTimer( void )
    {
        QueryPerformanceFrequency( &m_Frequency );
        QueryPerformanceCounter( &m_StartCount );
    }

    /// Restarts the measurement from the current counter value.
    __forceinline void Reset( void )
    {
        QueryPerformanceCounter( &m_StartCount );
    }

    /// Returns the time in seconds since construction or the last Reset().
    __forceinline double End( void )
    {
        // A plain local instead of a function-level static: there is no state
        // to keep between calls, and a static would be shared by every CTimer.
        LARGE_INTEGER curCount;
        QueryPerformanceCounter( &curCount );

        // Elapsed ticks are the DIFFERENCE between the current and the start
        // counter values; dividing by ticks-per-second converts to seconds.
        // (Multiplying the two counters overflows 64 bits and yields garbage.)
        return double( curCount.QuadPart - m_StartCount.QuadPart ) / double( m_Frequency.QuadPart );
    }

private:
    LARGE_INTEGER m_Frequency;   // value filled in by QueryPerformanceFrequency
    LARGE_INTEGER m_StartCount;  // counter value at construction / last Reset()
};
/// Multiplies every element of a float array by fScale, in place, using SSE.
///
/// @param pArray  Array to scale. No particular alignment is required.
/// @param dwCount Number of floats in pArray; may be any value, including 0
///                and values that are not a multiple of 4.
/// @param fScale  Factor each element is multiplied by.
void ScaleValue1( float *pArray, DWORD dwCount, float fScale )
{
    // Largest multiple of 4 that fits: this part is handled 4 floats at a time.
    const DWORD dwVectorEnd = dwCount - ( dwCount % 4 );
    const __m128 e_Scale = _mm_set_ps1( fScale );

    DWORD i = 0;
    for ( ; i < dwVectorEnd; i += 4 )
    {
        // Unaligned load/store: casting the pointer to __m128* would fault
        // whenever pArray is not 16-byte aligned.
        _mm_storeu_ps( pArray + i, _mm_mul_ps( _mm_loadu_ps( pArray + i ), e_Scale ) );
    }

    // Scalar tail: the 0-3 elements left over when dwCount is not a multiple
    // of 4 must be scaled as well.
    for ( ; i < dwCount; i++ )
    {
        pArray[i] *= fScale;
    }
}
/// Scalar reference version: multiplies each of the first dwCount floats in
/// pArray by fScale, in place, one element at a time.
void ScaleValue2( float *pArray, DWORD dwCount, float fScale )
{
    float *const pEnd = pArray + dwCount;
    for ( float *pCur = pArray; pCur != pEnd; ++pCur )
    {
        *pCur = *pCur * fScale;
    }
}

#define ARRAYCOUNT 10000
// Benchmark driver: runs ScaleValue1 and then ScaleValue2 over the same
// zero-filled, 16-byte-aligned buffer of ARRAYCOUNT floats, 100000 passes
// each, and prints the value reported by CTimer::End() for each run.
int __cdecl main()
{
    const int kPassCount = 100000;
    const float kScale = 1000.0f;

    float __declspec(align(16)) samples[ARRAYCOUNT];
    memset( samples, 0, sizeof(samples) );

    CTimer stopwatch;

    // SSE version.
    stopwatch.Reset();
    for ( int pass = 0; pass != kPassCount; ++pass )
    {
        ScaleValue1( samples, ARRAYCOUNT, kScale );
    }
    const double sseSeconds = stopwatch.End();
    cout << "Use SSE：" << sseSeconds << "秒" << endl;

    // Plain scalar version.
    stopwatch.Reset();
    for ( int pass = 0; pass != kPassCount; ++pass )
    {
        ScaleValue2( samples, ARRAYCOUNT, kScale );
    }
    const double scalarSeconds = stopwatch.End();
    cout << "Not Use SSE：" << scalarSeconds << "秒" << endl;

    // Keep the console window open until a key is pressed.
    system( "pause" );
    return 0;
}

分享到：