java8 gpu编程_网上说 Java 的性能已经达到甚至超过 C++,是真的吗?

这应该是和 .NET/C# 搞混了。现在很多游戏都是 C# 开发的。

如果你愿意,C# 完全可以当缺乏预处理步骤的 C 来用。一般来讲,C 能有什么性能,C# 也能达到。更不要说 C# 还能方便地使用 intrinsics 和进行 GPU 编程开发。而且 .NET 的 JIT 编译器也很给力。

举个小例子,这是 int128_t 的乘法:

// 128-bit operands and result. They are declared volatile so that every access
// to a0, b0 and c0 is a real load or store instead of being folded at compile time.
// __int128 is a GCC/Clang extension type.
__int128 volatile a0 = 5;

__int128 volatile b0 = 4;

__int128 volatile c0;

// Reads each operand once into a local, multiplies the two 128-bit values
// and writes the product to c0. Always returns 0.
int main() {

// `auto` drops the volatile qualifier, so a and b are plain __int128 copies.
auto a = a0;

auto b = b0;

c0 = a * b;

return 0;

}

这是编译结果:

__int128 volatile a0 = 5;

__int128 volatile b0 = 4;

__int128 volatile c0;

int main() {

00000001400027A0 sub rsp,28h

00000001400027A4 call 00000001400016F0

auto a = a0;

00000001400027A9 mov r10,qword ptr [140003020h]

00000001400027B0 mov r11,qword ptr [140003028h]

auto b = b0;

00000001400027B7 mov r8,qword ptr [140003010h]

00000001400027BE mov r9,qword ptr [140003018h]

c0 = a * b;

00000001400027C5 mov rcx,r11

00000001400027C8 mov rax,r10

00000001400027CB mov rdx,r9

00000001400027CE imul rcx,r8

00000001400027D2 imul rdx,r10

00000001400027D6 add rcx,rdx

00000001400027D9 mul rax,r8

00000001400027DC mov qword ptr [140007050h],rax

00000001400027E3 add rdx,rcx

return 0;

}

00000001400027E6 xor eax,eax

c0 = a * b;

00000001400027E8 mov qword ptr [140007058h],rdx

return 0;

}

00000001400027EF add rsp,28h

00000001400027F3 ret

对比 .NET 版本:(Int128 来自库 UltimateOrb.Core 的预览版,默认会 check overflow。)

using System.Runtime.CompilerServices;

using Int128 = UltimateOrb.Int128;

// Mirrors the C++ experiment: multiplies two 128-bit static values and stores the product.
class Program {

// Operands and result are static fields of the library-provided Int128 type
// (aliased to UltimateOrb.Int128 by the using directive above).
static Int128 a0 = 5;

static Int128 b0 = 4;

static Int128 c0;

// Asks the JIT to compile this method with full optimizations.
[MethodImpl(MethodImplOptions.AggressiveOptimization)]

static int Main(string[] args) {

var a = a0;

var b = b0;

// Calls the library's multiply entry point by name rather than using the `*` operator.
// NOTE(review): presumably op_MultiplyUnchecked skips the overflow check that the
// library applies by default — confirm against the UltimateOrb.Core documentation.
c0 = Int128.op_MultiplyUnchecked(a, b);

return 0;

}

}

JIT 编译结果:

static int Main(string[] args) {

var a = a0;

00007FFA37FBB490 sub rsp,28h

00007FFA37FBB494 xor eax,eax

00007FFA37FBB496 mov qword ptr [rsp+20h],rax

00007FFA37FBB49B mov rcx,7FFA38052DA8h

00007FFA37FBB4A5 mov edx,0Dh

00007FFA37FBB4AA call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE (07FFA97B0A770h)

00007FFA37FBB4AF mov rdx,1ED90002CA0h

00007FFA37FBB4B9 mov rdx,qword ptr [rdx]

00007FFA37FBB4BC add rdx,8

00007FFA37FBB4C0 mov rax,qword ptr [rdx]

00007FFA37FBB4C3 mov rcx,qword ptr [rdx+8]

var b = b0;

00007FFA37FBB4C7 mov rdx,1ED90002CA8h

00007FFA37FBB4D1 mov rdx,qword ptr [rdx]

00007FFA37FBB4D4 add rdx,8

00007FFA37FBB4D8 mov r8,qword ptr [rdx]

00007FFA37FBB4DB mov r9,qword ptr [rdx+8]

c0 = Int128.op_MultiplyUnchecked(a, b);

00007FFA37FBB4DF lea r10,[rsp+20h]

00007FFA37FBB4E4 mov rdx,rax

00007FFA37FBB4E7 mulx rdx,r11,r8

00007FFA37FBB4EC mov qword ptr [r10],r11

00007FFA37FBB4EF mov r10,qword ptr [rsp+20h]

00007FFA37FBB4F4 imul rcx,r8

00007FFA37FBB4F8 imul rax,r9

00007FFA37FBB4FC add rax,rdx

00007FFA37FBB4FF add rax,rcx

00007FFA37FBB502 mov rdx,1ED90002CB0h

00007FFA37FBB50C mov rdx,qword ptr [rdx]

00007FFA37FBB50F add rdx,8

00007FFA37FBB513 mov qword ptr [rdx],r10

00007FFA37FBB516 mov qword ptr [rdx+8],rax

return 0;

00007FFA37FBB51A xor eax,eax

00007FFA37FBB51C add rsp,28h

00007FFA37FBB520 ret

和 Clang 比还差一点。不过已经很不错了。都能在 64 位 x86 上化为 3 条乘法指令。(但我不清楚这里 .NET 的 mulx 比 mul 有啥优势。)也都会插入些 mov 提高效率。(AOT 的结果还没测试。)

又例如 SIMD 的部分,C/C++ 的画风是这样的:

/*
Project: SSBRenderer
File: matrix.cpp

Copyright (c) 2015, Christoph "Youka" Spanknebel

This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.

Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/

/// Multiplies this matrix by `other` in place using AVX and returns *this.
///
/// The 16 doubles of each matrix are processed as four groups of four. Group i
/// of the result is the sum over k of (element k of this matrix's group i) times
/// (group k of `other`). Reading each group as a row, this is `this = this * other`.
/// NOTE(review): row-major layout is assumed from the access pattern — confirm
/// against the class declaration.
///
/// `_mm256_load_pd` / `_mm256_store_pd` are the aligned variants, so both
/// `this->matrix` and `other.matrix` must be 32-byte aligned.
///
/// @param other Right-hand operand; only read.
/// @return Reference to this matrix, now holding the product.
Matrix4x4d& Matrix4x4d::multiply(const Matrix4x4d& other) {
    // Snapshot of the left-hand operand: this->matrix is overwritten group by
    // group below, so the scalars are broadcast from this copy instead.
    // NOTE(review): `storage` is not declared in this excerpt; presumably a
    // scratch buffer large enough for 16 doubles — confirm.
    double* const matrix1 = reinterpret_cast<double*>(&storage);
    const double* const matrix2 = other.matrix;
    std::copy(this->matrix, this->matrix + 16, matrix1);

    // The four groups of the right-hand operand are loaded once and reused.
    const __m256d m0 = _mm256_load_pd(matrix2);
    const __m256d m1 = _mm256_load_pd(matrix2 + 4);
    const __m256d m2 = _mm256_load_pd(matrix2 + 8);
    const __m256d m3 = _mm256_load_pd(matrix2 + 12);

    // Group 0
    _mm256_store_pd(
        this->matrix,
        _mm256_add_pd(
            _mm256_mul_pd(_mm256_broadcast_sd(matrix1), m0),
            _mm256_add_pd(
                _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 1), m1),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 2), m2),
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 3), m3)
                )
            )
        )
    );

    // Group 1
    _mm256_store_pd(
        this->matrix + 4,
        _mm256_add_pd(
            _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 4), m0),
            _mm256_add_pd(
                _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 5), m1),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 6), m2),
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 7), m3)
                )
            )
        )
    );

    // Group 2
    _mm256_store_pd(
        this->matrix + 8,
        _mm256_add_pd(
            _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 8), m0),
            _mm256_add_pd(
                _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 9), m1),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 10), m2),
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 11), m3)
                )
            )
        )
    );

    // Group 3
    _mm256_store_pd(
        this->matrix + 12,
        _mm256_add_pd(
            _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 12), m0),
            _mm256_add_pd(
                _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 13), m1),
                _mm256_add_pd(
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 14), m2),
                    _mm256_mul_pd(_mm256_broadcast_sd(matrix1 + 15), m3)
                )
            )
        )
    );

    return *this;
}

而 C# 的画风是这样的:

// Sample code, for reference only.
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

/// <summary>
/// 4x4 matrix of doubles held as four 256-bit vectors, one per row.
/// </summary>
struct Matrix4x4 {
    Vector256<double> _Row0;
    Vector256<double> _Row1;
    Vector256<double> _Row2;
    Vector256<double> _Row3;

    /// <summary>
    /// Returns the product <paramref name="first"/> * <paramref name="second"/>.
    /// Row i of the result is the sum over k of (element k of row i of
    /// <paramref name="first"/>) times (row k of <paramref name="second"/>).
    /// </summary>
    /// <remarks>
    /// Uses the <see cref="Avx"/> hardware intrinsics directly; callers should
    /// check <c>Avx.IsSupported</c> before relying on this operator.
    /// </remarks>
    public static unsafe Matrix4x4 operator *(Matrix4x4 first, Matrix4x4 second) {
        // `first` is a by-value copy, so its rows are read through a raw pointer
        // and then overwritten in place; each row only depends on its own elements.
        var row = (double*)&first._Row0;
        first._Row0 =
            Avx.Add(Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(0 + row), second._Row0),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(1 + row), second._Row1)),
                    Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(2 + row), second._Row2),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(3 + row), second._Row3)));

        row = (double*)&first._Row1;
        first._Row1 =
            Avx.Add(Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(0 + row), second._Row0),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(1 + row), second._Row1)),
                    Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(2 + row), second._Row2),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(3 + row), second._Row3)));

        row = (double*)&first._Row2;
        first._Row2 =
            Avx.Add(Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(0 + row), second._Row0),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(1 + row), second._Row1)),
                    Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(2 + row), second._Row2),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(3 + row), second._Row3)));

        row = (double*)&first._Row3;
        first._Row3 =
            Avx.Add(Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(0 + row), second._Row0),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(1 + row), second._Row1)),
                    Avx.Add(Avx.Multiply(Avx.BroadcastScalarToVector256(2 + row), second._Row2),
                            Avx.Multiply(Avx.BroadcastScalarToVector256(3 + row), second._Row3)));

        return first;
    }
}

大体上类型安全但不失灵活性,又有太阳系第一 IDE 助阵。谁的研发投入会比较高,就不用我说了吧。游戏的运行性能,除非卡成 bugs,否则功能特性都撸不完的话,就不可能去优化。有 C/C++ 的轮子用就用。一些很 chatty 不 chunky 的 API 在托管环境,肯定还是要靠 C# 写高性能的 .NET 库的。

回到主题,C/C++ 的代码编得稍稍差一点的话,在性能上就有可能被 C# 代码超过。特别是研发投入对等的情况下。Java 一般属于下一个梯队的。而且因为自身限制,很难优化上去。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值