C++高性能计算入门（优化一个N体引力问题）

最新推荐文章于 2024-06-13 17:20:58 发布

wwxy261

最新推荐文章于 2024-06-13 17:20:58 发布

阅读量545

点赞数

分类专栏：算法文章标签： c++ 开发语言后端

本文链接：https://blog.csdn.net/wwxy1995/article/details/122437888

版权

算法专栏收录该内容

3633 篇文章 117 订阅

订阅专栏

先用Modern C++实现一个demo

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

struct Star {
    float px, py, pz;
    float vx, vy, vz;
    float mass;
};

std::vector<Star> stars;

void init() {
    for (int i = 0; i < 48; i++) {
        stars.push_back({
            frand(), frand(), frand(),
            frand(), frand(), frand(),
            frand() + 1,
        });
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (auto &star: stars) {
        for (auto &other: stars) {
            float dx = other.px - star.px;
            float dy = other.py - star.py;
            float dz = other.pz - star.pz;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            star.vx += dx * other.mass * G * dt / d2;
            star.vy += dy * other.mass * G * dt / d2;
            star.vz += dz * other.mass * G * dt / d2;
        }
    }
    for (auto &star: stars) {
        star.px += star.vx * dt;
        star.py += star.vy * dt;
        star.pz += star.vz * dt;
    }
}

float calc() {
    float energy = 0;
    for (auto &star: stars) {
        float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
        energy += star.mass * v2 / 2;
        for (auto &other: stars) {
            float dx = other.px - star.px;
            float dy = other.py - star.py;
            float dz = other.pz - star.pz;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

首先是编译优化：-O3， -ffast-math -march=native

加上后者速度提高5倍

直接改成SOA内存布局，在Mac平台下速度反而变慢

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <array>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

const size_t N = 48;

struct Stars {
    std::array<float, N> px, py, pz;
    std::array<float, N> vx, vy, vz;
    std::array<float, N> mass;
}stars;

void init() {
    for (size_t i = 0; i < 48; i++) {
        stars.px[i]=frand();
        stars.py[i]=frand();
        stars.pz[i]=frand();
        stars.vx[i]=frand();
        stars.vy[i]=frand();
        stars.vz[i]=frand();
        stars.mass[i]=frand()+1;
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (size_t i = 0; i < N; i++) {
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - stars.px[i];
            float dy = stars.py[j] - stars.py[i];
            float dz = stars.pz[j] - stars.pz[i];
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            stars.vx[i] += dx * stars.mass[j] * G * dt / d2;
            stars.vy[i] += dy * stars.mass[j] * G * dt / d2;
            stars.vz[i] += dz * stars.mass[j] * G * dt / d2;
        }
    }
    for (size_t i = 0; i < N; i++) {
        stars.px[i] += stars.vx[i] * dt;
        stars.py[i] += stars.vy[i] * dt;
        stars.pz[i] += stars.vz[i] * dt;
    }
}

float calc() {
    float energy = 0;
    for (size_t i = 0; i < N; i++) {
        float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
        energy += stars.mass[i] * v2 / 2.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - stars.px[i];
            float dy = stars.py[j] - stars.py[i];
            float dz = stars.pz[j] - stars.pz[i];
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) / 2.0f;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

这是因为在循环中对数组访问含有i和j指标，所以不能很好的向量化，需要用临时变量保存，采样如下写法加速如下：

#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <cmath>
#include <array>

float frand() {
    return (float)std::rand() / RAND_MAX * 2 - 1;
}

const size_t N = 48;

struct Stars {
    std::array<float, N> px, py, pz;
    std::array<float, N> vx, vy, vz;
    std::array<float, N> mass;
}stars;

void init() {
    for (size_t i = 0; i < 48; i++) {
        stars.px[i]=frand();
        stars.py[i]=frand();
        stars.pz[i]=frand();
        stars.vx[i]=frand();
        stars.vy[i]=frand();
        stars.vz[i]=frand();
        stars.mass[i]=frand()+1;
    }
}

float G = 0.001;
float eps = 0.001;
float dt = 0.01;

void step() {
    for (size_t i = 0; i < N; i++) {
        float px_tmp = stars.px[i];
        float py_tmp = stars.py[i];
        float pz_tmp = stars.pz[i];
        float vx_tmp = 0.0f;
        float vy_tmp = 0.0f;
        float vz_tmp = 0.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - px_tmp;
            float dy = stars.py[j] - py_tmp;
            float dz = stars.pz[j] - pz_tmp;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            d2 *= std::sqrt(d2);
            vx_tmp += dx * stars.mass[j] * G * dt / d2;
            vy_tmp += dy * stars.mass[j] * G * dt / d2;
            vz_tmp += dz * stars.mass[j] * G * dt / d2;
        }
        stars.vx[i] += vx_tmp;
        stars.vy[i] += vy_tmp;
        stars.vz[i] += vz_tmp;
    }
    for (size_t i = 0; i < N; i++) {
        stars.px[i] += stars.vx[i] * dt;
        stars.py[i] += stars.vy[i] * dt;
        stars.pz[i] += stars.vz[i] * dt;
    }
}

float calc() {
    float energy = 0;
    for (size_t i = 0; i < N; i++) {
        float px_tmp = stars.px[i];
        float py_tmp = stars.py[i];
        float pz_tmp = stars.pz[i];
        float mass_tmp = stars.mass[i];
        float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
        energy += stars.mass[i] * v2 / 2.0f;
        for (size_t j = 0; j < N; j++) {
            float dx = stars.px[j] - px_tmp;
            float dy = stars.py[j] - py_tmp;
            float dz = stars.pz[j] - pz_tmp;
            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
            energy -= stars.mass[j] * mass_tmp * G / std::sqrt(d2) / 2.0f;
        }
    }
    return energy;
}

template <class Func>
long benchmark(Func const &func) {
    auto t0 = std::chrono::steady_clock::now();
    func();
    auto t1 = std::chrono::steady_clock::now();
    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
    return dt.count();
}

int main() {
    init();
    printf("Initial energy: %f\n", calc());
    auto dt = benchmark([&] {
        for (int i = 0; i < 100000; i++)
            step();
    });
    printf("Final energy: %f\n", calc());
    printf("Time elapsed: %ld ms\n", dt);
    return 0;
}

以上写法基本答到要求。大概有8倍左右加速。

wwxy261

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
C++高性能计算入门（优化一个N体引力问题）

先用Modern C++实现一个demo#include <cstdio>#include <cstdlib>#include <vector>#include <chrono>#include <cmath>float frand() { return (float)std::rand() / RAND_MAX * 2 - 1;}struct Star { float px, py, pz; fl.
复制链接

扫一扫