英特尔多核平台编码优化大赛的优化过程--补充

2026/1/23 15:24:07

分类：代码优化2007-01-20 17:20 2521人阅读评论(2) 收藏举报

英特尔多核平台编码优化大赛的优化过程--补充

HouSisong@GMail.com 2007.01.20

tag: 多核编程, sse2, 牛顿迭代, 代码优化, 优化大赛, invsqrt, 开方

主要文章请参看我的《英特尔多核平台编码优化大赛的优化过程》：

http://blog.csdn.net/housisong/archive/2007/01/20/1488465.aspx

本文章是其补充；提供一个完整的float实现版本、double到float的手工转换、手工得到invSqrt的粗略起始迭代值等其它几个不算成功的实现。

我测试和优化过程中用的 CPU：AMD64x2 3600+ (双核CPU)；操作系统：Windows XP 32bit；编译器：Visual Studio 2005。大赛公布的原始代码执行时间：3.97秒。

一个float完整实现版本(牺牲了计算精度)，源代码如下：

(如果用汇编实现应该还可以把速度提高一些，或者使用ICC编译器:) /* compute the potential energy of a collection of */ /* particles interacting via pairwise potential */ #include #include #include #include

#include //使用SSE1 #include #include

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Author: Hou Sisong (侯思松) HouSisong@263.net
// This version computes its result at float (single-precision) accuracy.

#define _IS_FAST

// When defined, speed is gained at the cost of accuracy; otherwise the computation reaches float single precision.

////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Uncomment to time with the Windows performance counter instead of the
// C runtime clock().
//#define _NEW_TIME_CLOCK

// Each preprocessor directive must sit on its own line: a directive runs to
// the end of the line, so anything written after it on the same line is
// consumed by the directive instead of being compiled as code.
#ifdef _NEW_TIME_CLOCK

// Replacement timing facilities with the same names as the <time.h> ones, so
// that callers (clock_t, clock(), CLOCKS_PER_SEC) need no changes.
// NOTE(review): QueryPerformanceFrequency / QueryPerformanceCounter and
// LARGE_INTEGER come from the Windows API; this branch also assumes <time.h>
// has not been included earlier, otherwise these names would clash - confirm.
#define clock_t double
double CLOCKS_PER_SEC=0.0;

// Returns the current performance-counter value as a double.
// On the first call (CLOCKS_PER_SEC still 0) it also stores the counter
// frequency in CLOCKS_PER_SEC, so (stop-start)/CLOCKS_PER_SEC is in seconds.
inline double clock() {
    __int64 result;

    if (CLOCKS_PER_SEC==0) {
        QueryPerformanceFrequency((LARGE_INTEGER *)&result);
        CLOCKS_PER_SEC=(double)result;
    }

    QueryPerformanceCounter((LARGE_INTEGER *)&result);
    return (double)result;
}

#else

// Standard clock(), clock_t and CLOCKS_PER_SEC.
#include <time.h>

#endif

#define _IS_USES_MY_RAND

// rand is only called from a single thread, so using the custom rand is safe.

const long DefthreadCount=2; // 1,2,4,8,16,...: number of tasks the computation is split into for parallel execution

// Returns a mutable reference to element `index` of `x`, obtained by viewing
// the storage of the __m128 object as an array of float.
// No bounds check is performed on `index`.
float& m128_value(__m128& x,const long index)
{
    float* const elements = reinterpret_cast<float*>(&x);
    return elements[index];
}

// Problem-size constants. Each #define needs its own line: a directive runs
// to the end of the line, so chaining them would make NPARTS expand to the
// rest of the line and leave NITER and DIMS undefined.
// DIMS and NPARTS size the global position array r[DIMS][...].
// NOTE(review): NITER is presumably the iteration count of the main loop -
// confirm against the loop header.
#define NPARTS 1000
#define NITER 201
#define DIMS 3

// Selects the random source used through the name _my_rand.
// Every directive is on its own line; a directive sharing a line with code
// either swallows that code or is not recognised as a directive at all.
#ifdef _IS_USES_MY_RAND

// Small linear congruential generator with its own private state, used
// instead of the C runtime rand().
class CMyRand {
private:
    unsigned long _my_holdrand; // current generator state

public:
    // The state starts at 1.
    CMyRand():_my_holdrand(1){}

    // Advances the state (state * 214013 + 2531011) and returns the new state
    // shifted right by 16 bits and masked with RAND_MAX.
    inline int _my_rand (void) {
        unsigned long result=_my_holdrand * 214013 + 2531011;
        _my_holdrand =result;
        return static_cast<int>( (result>>16) & RAND_MAX );
    }
};

// Single shared generator instance; it is not synchronised.
CMyRand _MyRand;

// Free-function wrapper so that callers write _my_rand() in either configuration.
inline int _my_rand (void){ return _MyRand._my_rand(); }

#else

// Fall back to the C runtime generator.
#define _my_rand rand

#endif

// Forward declarations of the routines used by main().
int rand( void );
int computePot();

void initPositions(void);
void updatePositions(void);

// Position array: DIMS rows, each padded up to a multiple of 4 floats,
// with the array aligned on a 16-byte boundary.
__declspec(align(16)) float r[DIMS][(NPARTS+3)/4*4]; // 16-byte aligned

// Global assigned by main(). It must be declared on its own line: placed after
// the // comment above it would be part of the comment and never declared.
double pot;

// Program entry point: calls initPositions() and updatePositions() once,
// takes a start time with clock(), runs the main loop, takes a stop time and
// prints a value derived from (stop-start)/CLOCKS_PER_SEC. Returns 0.
// NOTE(review): this listing was damaged during text extraction - the for-loop
// header and the contents of both printf format strings are truncated and
// cannot be recovered from this text; restore them from the original source
// before compiling.
int main() { int i;

clock_t start, stop;

//char ctmp; std::cin>>ctmp;

initPositions(); updatePositions();

start=clock();

// NOTE(review): the loop condition and the start of the loop body are missing here.
for( i=0; i

printf(\ updatePositions(); }

pot = 0.0; stop=clock();

// NOTE(review): the format string of this printf is missing.
printf (\double)(stop-start)/ CLOCKS_PER_SEC);

return 0; }

// Fills the global array r with start values: each visited element r[i][j]
// is set to 0.5 + _my_rand() * (1.0/RAND_MAX), converted to float.
// NOTE(review): the loop headers were truncated during text extraction, so the
// ranges of i and j are not visible here - restore them from the original
// source before compiling.
void initPositions() { int i, j;

for( i=0; i

// presumably _my_rand() lies in [0, RAND_MAX], giving values in [0.5, 1.5] - confirm
r[i][j]= (float)( 0.5 + _my_rand() *(1.0/RAND_MAX) ); } }

英特尔多核平台编码优化大赛的优化过程--补充.doc 将本文的Word文档下载到电脑

下载这篇word文档