都2019了, Java官方也没有意向把自定义值类型加入标准, 只能说Java放弃了这小块领域, 让给C++去发挥优势了. 毕竟Java也占据了足够大的领域, 这些领域极少出现这种大量对齐小对象的情况.
其实, 支持自定义值类型也可能会对语言带来很复杂的设计, 对Java来说可能得不偿失. 像C#的struct虽然支持了自定义值类型, 但缺乏完整的引用语义, ref引用只能放在栈上而不能保存到堆中(如容器里). 如果被逼得用上unsafe指针(也有fixed和pin的局限性), 那又何苦不用C++呢.
话说回来, 如果真想让Java支持大量值对象的紧凑存储, 也不是不可能, 毕竟有Unsafe类这个既底层又高效,但也危险的类, 实现如下(需JDK9以上,JDK8以下需要做少量修改):
import java.lang.reflect.Field;
import java.util.Objects;
import sun.misc.Unsafe;
abstract class StructArray {
public static final Unsafe unsafe;
static {
try {
Field theUnsafeField = Class.forName("sun.misc.Unsafe").getDeclaredField("theUnsafe");
theUnsafeField.setAccessible(true);
unsafe = (Unsafe) theUnsafeField.get(null);
} catch (Exception e) {
throw new Error(e);
}
}
public final int count;
protected final byte[] mem;
protected StructArray(int stride, int count) {
this.count = count;
mem = new byte[stride * count];
}
}
// 根据定义"struct StructArrayABCD { long a; int b; float c; double d; }"可以生成如下类class StructArrayABCD extends StructArray {
public static final int OFFSET_A = Unsafe.ARRAY_BYTE_BASE_OFFSET;
public static final int OFFSET_B = OFFSET_A + 8;
public static final int OFFSET_C = OFFSET_B + 4;
public static final int OFFSET_D = OFFSET_C + 4;
public static final int STRIDE = 8 + 4 + 4 + 8;
public StructArrayABCD(int count) {
super(STRIDE, count);
}
public long getA(int idx) {
Objects.checkIndex(idx, count);
return unsafe.getLong(mem, STRIDE * idx + OFFSET_A);
}
public void setA(int idx, long a) {
Objects.checkIndex(idx, count);
unsafe.putLong(mem, STRIDE * idx + OFFSET_A, a);
}
public int getB(int idx) {
Objects.checkIndex(idx, count);
return unsafe.getInt(mem, STRIDE * idx + OFFSET_B);
}
public void setB(int idx, int b) {
Objects.checkIndex(idx, count);
unsafe.putInt(mem, STRIDE * idx + OFFSET_B, b);
}
public float getC(int idx) {
Objects.checkIndex(idx, count);
return unsafe.getFloat(mem, STRIDE * idx + OFFSET_C);
}
public void setC(int idx, float c) {
Objects.checkIndex(idx, count);
unsafe.putFloat(mem, STRIDE * idx + OFFSET_C, c);
}
public double getD(int idx) {
Objects.checkIndex(idx, count);
return unsafe.getDouble(mem, STRIDE * idx + OFFSET_D);
}
public void setD(int idx, double d) {
Objects.checkIndex(idx, count);
unsafe.putDouble(mem, STRIDE * idx + OFFSET_D, d);
}
}
/**
 * Benchmark driver: fills a million-record StructArrayABCD, then runs ten
 * passes of pseudo-random cross-record read-modify-write and prints the
 * last record plus the elapsed milliseconds.
 */
public class StructArrayTest {
    public static void main(String[] args) {
        final int TEST_COUNT = 1_000_000;
        final StructArrayABCD arr = new StructArrayABCD(TEST_COUNT);
        // Seed every record with index-derived values.
        for (int i = 0; i < TEST_COUNT; ++i) {
            arr.setA(i, i * 123456789L);
            arr.setB(i, i * 987654321);
            arr.setC(i, i * 3.1415926f);
            arr.setD(i, i * 2.7182818);
        }
        final long startNs = System.nanoTime();
        for (int round = 0; round < 10; ++round) {
            for (int i = 0; i < TEST_COUNT; ++i) {
                // Pseudo-random partner indices; the & 0x7fff_ffff mask makes
                // the deliberately-overflowing products non-negative.
                final int p = ((i * 123456789) & 0x7fff_ffff) % TEST_COUNT;
                final int q = ((i * 987654321) & 0x7fff_ffff) % TEST_COUNT;
                arr.setA(i, arr.getA(i) + arr.getB(p));
                arr.setB(i, arr.getB(i) - (int) arr.getA(q));
                arr.setC(i, arr.getC(i) + (float) arr.getD(p));
                arr.setD(i, arr.getD(i) - arr.getC(q));
            }
        }
        final long elapsedMs = (System.nanoTime() - startNs) / 1_000_000;
        System.out.format("%d, %d, %f, %f, %d ms\n", arr.getA(TEST_COUNT - 1), arr.getB(TEST_COUNT - 1),
                arr.getC(TEST_COUNT - 1), arr.getD(TEST_COUNT - 1), elapsedMs);
    }
}
我的测试PC(Win10 64-bit; Intel I5-4430 (3GHz), OpenJDK 11.0.2 (64-bit))输出结果是:123447909965457, 738911949, -2981082368.000000, 71560194153.706730, 572 ms
算下来每个循环只需150个CPU时钟, 猜测主要消耗是内存读写的cache miss率很高(24MB内存块的随机访问). 值得注意的是, 即使用了Unsafe类危险的"黑魔法", StructArrayABCD类提供的公开方法仍然是非常安全的. 如果去掉了检查idx参数是否合法的"Objects.checkIndex"验证, 并不能明显提升性能(<5%), 这在非如此密集访问的情况下影响更是微乎其微.
再给出对应的C++参考实现:
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Plain value struct matching the Java StructArrayABCD record layout.
// On typical 64-bit ABIs: offsets 0, 8, 12, 16; sizeof == 24 with no padding
// (the int/float pair fills the 8-byte slot between a and d).
struct StructArrayABCD {
long long a; // 8 bytes
int b;       // 4 bytes
float c;     // 4 bytes
double d;    // 8 bytes
};
int main() {
const int TEST_COUNT = 1000000;
StructArrayABCD* s = new StructArrayABCD[TEST_COUNT];
for (int i = 0; i < TEST_COUNT; ++i) {
s[i].a = i * 123456789LL;
s[i].b = i * 987654321;
s[i].c = i * 3.1415926f;
s[i].d = i * 2.7182818;
}
auto t = duration_cast(system_clock::now().time_since_epoch()).count();
for (int j = 0; j < 10; ++j) {
for (int i = 0; i < TEST_COUNT; ++i) {
s[i].a += s[((i * 123456789) & 0x7fffffff) % TEST_COUNT].b;
s[i].b -= (int)s[((i * 987654321) & 0x7fffffff) % TEST_COUNT].a;
s[i].c += (float)s[((i * 123456789) & 0x7fffffff) % TEST_COUNT].d;
s[i].d -= s[((i * 987654321) & 0x7fffffff) % TEST_COUNT].c;
}
}
cout << s[TEST_COUNT - 1].a << ", " << s[TEST_COUNT - 1].b << ", " << s[TEST_COUNT - 1].c << ", " << s[TEST_COUNT - 1].d
<< ", " << duration_cast(system_clock::now().time_since_epoch()).count() - t << "ms" << endl;
return 0;
}
我的测试PC(Win10 64-bit; Intel I5-4430 (3GHz), MingW GCC 8.2.0 (64-bit))输出结果是:123447909965457, 738911949, -2.98108e+09, 7.15602e+10, 432ms
值得注意的是, 这个结果是用"-m64 -O1"参数来编译的, 而用"-O2"及更高的优化会输出错误的结果(原因在于"i * 123456789"这类有符号整数乘法会溢出, 而有符号溢出在C++中是未定义行为, 优化器可以据此做任意变换; 改用无符号运算即可避免), 而且所用时间也少不了400ms.
看来Java的读写时间开销比C++多了1/3, 也许不少需求可以接受(主要是内存需求得到了满足), 毕竟这块领域还是C++更灵活更有优势.