#include <iostream>
#include <windows.h> //微秒计时器
#include <time.h> //clock函数
using namespace std;
//旨在验证5.5\5.6的指令级并行,减少循环次数,减少内存访问次数对于运算时间的影响;
//验证结果:
//与预期严重不符!!!version2运行所用周期平均在230左右;version1所用周期在330左右。
//version1
// Evaluates the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree by
// accumulating one term at a time while keeping a running power of x.
//   a      - coefficient array; elements a[0]..a[degree] are read
//   x      - point at which the polynomial is evaluated
//   degree - highest exponent of the polynomial
// Returns the accumulated sum of all terms.
double PolyMut1(double a[], double x, long degree)
{
    double sum = a[0];   // constant term
    double power = x;    // x^k for the term about to be added
    long k = 1;
    while (k <= degree) {
        sum += a[k] * power;   // add the k-th term
        power *= x;            // advance to x^(k+1)
        ++k;
    }
    return sum;
}
//version2
// Evaluates the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree using
// Horner's rule: start from the highest coefficient and repeatedly
// multiply by x and add the next lower coefficient.
//   a      - coefficient array; elements a[0]..a[degree] are read
//   x      - point at which the polynomial is evaluated
//   degree - highest exponent of the polynomial
// Returns the value of the polynomial at x.
double PolyMut2(double a[], double x, long degree)
{
    double acc = a[degree];
    // remaining counts how many lower-order coefficients are still to be folded in.
    for (long remaining = degree; remaining > 0; --remaining) {
        acc = a[remaining - 1] + x * acc;
    }
    return acc;
}
//version3
// Evaluates the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree with a
// 5-way unrolled loop and five independent accumulators, so that the
// multiply/add chains of different terms do not depend on each other.
//   a      - coefficient array; elements a[0]..a[degree] are read
//   x      - point at which the polynomial is evaluated
//   degree - highest exponent of the polynomial
// Returns the value of the polynomial at x.
//
// Fixes relative to the previous revision:
//   * the partial sums are accumulated (+=) instead of being overwritten on
//     every iteration, which discarded all but the last unrolled group;
//   * the a[degree] term is included, matching PolyMut1 and PolyMut2;
//   * the loop index has the same type as degree.
double PolyMut3(double a[], double x, long degree)
{
    // Running powers for the five lanes: x^i, x^(i+1), ..., x^(i+4).
    double xpwr1 = x;
    double xpwr2 = x * xpwr1;
    double xpwr3 = x * xpwr2;
    double xpwr4 = x * xpwr3;
    double xpwr5 = x * xpwr4;
    // Every lane advances five exponents per iteration, so the step factor
    // is the constant x^5 and never needs to be updated.
    const double stride = xpwr5;

    double result0 = a[0];   // constant term plus the leftover terms
    double result1 = 0;
    double result2 = 0;
    double result3 = 0;
    double result4 = 0;
    double result5 = 0;

    long i = 1;
    // Unrolled part: handle terms i..i+4 while all five indices are <= degree.
    for (; i + 4 <= degree; i += 5) {
        result1 += a[i]     * xpwr1;
        result2 += a[i + 1] * xpwr2;
        result3 += a[i + 2] * xpwr3;
        result4 += a[i + 3] * xpwr4;
        result5 += a[i + 4] * xpwr5;
        xpwr1 *= stride;
        xpwr2 *= stride;
        xpwr3 *= stride;
        xpwr4 *= stride;
        xpwr5 *= stride;
    }

    // Leftover terms (at most four). At this point xpwr1 == x^i.
    for (; i <= degree; ++i) {
        result0 += a[i] * xpwr1;
        xpwr1 *= x;
    }

    return result0 + result1 + result2 + result3 + result4 + result5;
}
// Benchmark driver: builds a test polynomial, evaluates it once with one of
// the PolyMut variants, and prints the result together with the elapsed
// time measured by the Windows high-resolution performance counter.
int main()
{
    // Test polynomial. A polynomial of degree N has N + 1 coefficients
    // (a[0]..a[N]); the previous revision allocated only N elements while
    // passing N as the degree, so a[N] was read past the end of the array.
    const long kDegree = 1000;
    double coefficient[kDegree + 1] = {0};
    for (long i = 0; i <= kDegree; ++i) {
        coefficient[i] = 2;
    }
    // NOTE: with this x, x^1000 is far beyond the range of double, so the
    // evaluated polynomial overflows to infinity.
    double x = 314159265;

    // Timer setup: f receives the counter frequency, used below to convert
    // a tick difference into microseconds.
    long long run_time;
    LARGE_INTEGER time_start;
    LARGE_INTEGER time_over;
    double dqFreq;
    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);
    dqFreq = (double)f.QuadPart;

    QueryPerformanceCounter(&time_start);
    // Enable exactly one of the following calls to select the variant
    // being timed.
    // version1
    // double result = PolyMut1(coefficient, x, kDegree);
    // version2
    double result = PolyMut2(coefficient, x, kDegree);
    // version3
    // double result = PolyMut3(coefficient, x, kDegree);
    QueryPerformanceCounter(&time_over);

    // Elapsed ticks * 1e6 / (ticks per second) = elapsed microseconds;
    // the fractional part is dropped by the conversion to long long.
    run_time = (time_over.QuadPart - time_start.QuadPart)*1000000/dqFreq ;

    std::cout<<"the result is: "<<result<<std::endl;
    std::cout<<run_time<<" "<<dqFreq<<std::endl;
    return 0;
}
//最终的结果都是inf——无穷大(或解释为超出浮点数范围)。尽管如此,三个版本的计算用时却各不相同。用微秒计数,version2理论上的CPE值最大,用时应当最长;version3的CPE最小,用时应当最短。
//实测下来,version2运行6次的平均时间为220-230;version3需要260左右,version1需要300出头!!!version1劣于version2可以解释为编译器的优化;version3既用了循环展开,又用了并行(并行有多种,没法分清,统一称之为并行),应当是最优解才对。
//很不解,很蒙圈。