C++高性能计算入门(优化一个N体引力问题)

先用Modern C++实现一个demo 

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

struct Star {
    float px, py, pz;
    float vx, vy, vz;
    float mass;
};

std::vector<Star> stars;

void init() {
    for (int i = 0; i < 48; i++) {
        stars.push_back({
            frand(), frand(), frand(),
            frand(), frand(), frand(),
            frand() + 1,
        });
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (auto &star: stars) {
        for (auto &other: stars) {
            float dx = other.px - star.px;
            float dy = other.py - star.py;
            float dz = other.pz - star.pz;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            star.vx += dx * other.mass * G * dt / d2;
            star.vy += dy * other.mass * G * dt / d2;
            star.vz += dz * other.mass * G * dt / d2;
        }
    }
    for (auto &star: stars) {
        star.px += star.vx * dt;
        star.py += star.vy * dt;
        star.pz += star.vz * dt;
    }
}

float calc() {
    float energy = 0;
    for (auto &star: stars) {
        float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
        energy += star.mass * v2 / 2;
        for (auto &other: stars) {
            float dx = other.px - star.px;
            float dy = other.py - star.py;
            float dz = other.pz - star.pz;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

首先是编译优化:-O3, -ffast-math -march=native

加上后者速度提高5倍

直接改成SOA内存布局,在Mac平台下速度反而变慢

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <array>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

const size_t N = 48;

struct Stars {
    std::array<float, N> px, py, pz;
    std::array<float, N> vx, vy, vz;
    std::array<float, N> mass;
}stars;

void init() {
    for (size_t i = 0; i < 48; i++) {
        stars.px[i]=frand();
        stars.py[i]=frand();
        stars.pz[i]=frand();
        stars.vx[i]=frand();
        stars.vy[i]=frand();
        stars.vz[i]=frand();
        stars.mass[i]=frand()+1;
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (size_t i = 0; i < N; i++) {
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - stars.px[i];
            float dy = stars.py[j] - stars.py[i];
            float dz = stars.pz[j] - stars.pz[i];
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            stars.vx[i] += dx * stars.mass[j] * G * dt / d2;
            stars.vy[i] += dy * stars.mass[j] * G * dt / d2;
            stars.vz[i] += dz * stars.mass[j] * G * dt / d2;
        }
    }
    for (size_t i = 0; i < N; i++) {
        stars.px[i] += stars.vx[i] * dt;
        stars.py[i] += stars.vy[i] * dt;
        stars.pz[i] += stars.vz[i] * dt;
    }
}

float calc() {
    float energy = 0;
    for (size_t i = 0; i < N; i++) {
        float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
        energy += stars.mass[i] * v2 / 2.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - stars.px[i];
            float dy = stars.py[j] - stars.py[i];
            float dz = stars.pz[j] - stars.pz[i];
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) / 2.0f;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

这是因为在循环中对数组访问含有i和j指标,所以不能很好的向量化,需要用临时变量保存,采样如下写法加速如下:

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <array>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

const size_t N = 48;

struct Stars {
    std::array<float, N> px, py, pz;
    std::array<float, N> vx, vy, vz;
    std::array<float, N> mass;
}stars;

void init() {
    for (size_t i = 0; i < 48; i++) {
        stars.px[i]=frand();
        stars.py[i]=frand();
        stars.pz[i]=frand();
        stars.vx[i]=frand();
        stars.vy[i]=frand();
        stars.vz[i]=frand();
        stars.mass[i]=frand()+1;
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (size_t i = 0; i < N; i++) {
        float px_tmp = stars.px[i];
        float py_tmp = stars.py[i];
        float pz_tmp = stars.pz[i];
        float vx_tmp = 0.0f;
        float vy_tmp = 0.0f;
        float vz_tmp = 0.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - px_tmp;
            float dy = stars.py[j] - py_tmp;
            float dz = stars.pz[j] - pz_tmp;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            vx_tmp += dx * stars.mass[j] * G * dt / d2;
            vy_tmp += dy * stars.mass[j] * G * dt / d2;
            vz_tmp += dz * stars.mass[j] * G * dt / d2;
        }
        stars.vx[i] += vx_tmp;
        stars.vy[i] += vy_tmp;
        stars.vz[i] += vz_tmp;
    }
    for (size_t i = 0; i < N; i++) {
        stars.px[i] += stars.vx[i] * dt;
        stars.py[i] += stars.vy[i] * dt;
        stars.pz[i] += stars.vz[i] * dt;
    }
}

float calc() {
    float energy = 0;
    for (size_t i = 0; i < N; i++) {
        float px_tmp = stars.px[i];
        float py_tmp = stars.py[i];
        float pz_tmp = stars.pz[i];
        float mass_tmp = stars.mass[i];
        float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
        energy += stars.mass[i] * v2 / 2.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - px_tmp;
            float dy = stars.py[j] - py_tmp;
            float dz = stars.pz[j] - pz_tmp;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= stars.mass[j] * mass_tmp * G / std::sqrt(d2) / 2.0f;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

以上写法基本答到要求。大概有8倍左右加速。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值