用x64汇编优化8位S盒置换(二)

在论坛上有人提出,可以考虑用空间换取时间的方式来优化S盒置换,使用8位S盒,4字节置换需要进行4次查表,哪怕是S盒数组为了内存对齐而采用64位数据宽度,其本质仍旧是8位S盒数组,数据单元总数依旧是256个,对于本文所涉及的课题,理论上可以用16位甚至是32位S盒数组,前者进行4字节置换仅需2次查表操作,后者则一步到位,1次查表就完成了4字节置换,然而从实用角度而言,16位S盒数组是唯一可行的方案,原因很简单:32位S盒数组至少要占用 (2 ^ 32) x 4 = 16GB内存,这不但浪费内存而且会因为频繁大范围内存寻址造成CPU缓存抖动。

首先是内存最少的方式,S盒数组下标范围16位,每个数据单元16位,总计占用 65536 x 2 = 128KB内存,完整代码如下:

#include <stdint.h>

/*
 * 8-bit substitution table (256 one-byte entries), indexed by the input byte.
 * The accompanying article uses it for the SMS4 cipher. It is only ever read,
 * so it is declared const.
 */
static const uint8_t s_sbox8[256] = {
    0xd6,0x90,0xe9,0xfe,0xcc,0xe1,0x3d,0xb7,0x16,0xb6,0x14,0xc2,0x28,0xfb,0x2c,0x05,
    0x2b,0x67,0x9a,0x76,0x2a,0xbe,0x04,0xc3,0xaa,0x44,0x13,0x26,0x49,0x86,0x06,0x99,
    0x9c,0x42,0x50,0xf4,0x91,0xef,0x98,0x7a,0x33,0x54,0x0b,0x43,0xed,0xcf,0xac,0x62,
    0xe4,0xb3,0x1c,0xa9,0xc9,0x08,0xe8,0x95,0x80,0xdf,0x94,0xfa,0x75,0x8f,0x3f,0xa6,
    0x47,0x07,0xa7,0xfc,0xf3,0x73,0x17,0xba,0x83,0x59,0x3c,0x19,0xe6,0x85,0x4f,0xa8,
    0x68,0x6b,0x81,0xb2,0x71,0x64,0xda,0x8b,0xf8,0xeb,0x0f,0x4b,0x70,0x56,0x9d,0x35,
    0x1e,0x24,0x0e,0x5e,0x63,0x58,0xd1,0xa2,0x25,0x22,0x7c,0x3b,0x01,0x21,0x78,0x87,
    0xd4,0x00,0x46,0x57,0x9f,0xd3,0x27,0x52,0x4c,0x36,0x02,0xe7,0xa0,0xc4,0xc8,0x9e,
    0xea,0xbf,0x8a,0xd2,0x40,0xc7,0x38,0xb5,0xa3,0xf7,0xf2,0xce,0xf9,0x61,0x15,0xa1,
    0xe0,0xae,0x5d,0xa4,0x9b,0x34,0x1a,0x55,0xad,0x93,0x32,0x30,0xf5,0x8c,0xb1,0xe3,
    0x1d,0xf6,0xe2,0x2e,0x82,0x66,0xca,0x60,0xc0,0x29,0x23,0xab,0x0d,0x53,0x4e,0x6f,
    0xd5,0xdb,0x37,0x45,0xde,0xfd,0x8e,0x2f,0x03,0xff,0x6a,0x72,0x6d,0x6c,0x5b,0x51,
    0x8d,0x1b,0xaf,0x92,0xbb,0xdd,0xbc,0x7f,0x11,0xd9,0x5c,0x41,0x1f,0x10,0x5a,0xd8,
    0x0a,0xc1,0x31,0x88,0xa5,0xcd,0x7b,0xbd,0x2d,0x74,0xd0,0x12,0xb8,0xe5,0xb4,0xb0,
    0x89,0x69,0x97,0x4a,0x0c,0x96,0x77,0x7e,0x65,0xb9,0xf1,0x09,0xc5,0x6e,0xc6,0x84,
    0x18,0xf0,0x7d,0xec,0x3a,0xdc,0x4d,0x20,0x79,0xee,0x5f,0x3e,0xd7,0xcb,0x39,0x48
};

/* 16-bit lookup table: one 16-bit entry for each of the 65536 possible indices. */
static uint16_t s_sbox[65536];

/*
 * Fill s_sbox from the 8-bit table s_sbox8.
 *
 * For an index made of a high byte and a low byte, the entry holds
 * s_sbox8[high] in its upper 8 bits and s_sbox8[low] in its lower 8 bits.
 * Must be called before s_sbox is used for lookups.
 */
void sbox_init(void)
{
        unsigned        hi, lo;

        for (hi = 0; hi < 256; hi++) {
                for (lo = 0; lo < 256; lo++) {
                        /* the two substituted bytes occupy disjoint bit ranges */
                        s_sbox[(hi << 8) | lo] =
                                (uint16_t)((s_sbox8[hi] << 8) | s_sbox8[lo]);
                }
        }
}

/*
 * Substitute a 32-bit word with two lookups in the 16-bit table s_sbox.
 *
 * The low 16 bits and the high 16 bits of src are each used as a table
 * index; the two results are recombined in the same positions.
 * s_sbox must have been filled by sbox_init() beforehand.
 *
 * src:     word to substitute.
 * returns: the substituted word.
 */
uint32_t sbox(uint32_t src)
{
        uint32_t lo = s_sbox[src & 0xffff];
        uint32_t hi = s_sbox[src >> 16];

        /*
         * Shift in uint32_t. A uint16_t table entry would otherwise be
         * promoted to signed int, and shifting a value >= 0x8000 left by 16
         * overflows int, which is undefined behaviour.
         */
        return lo ^ (hi << 16);
}

在原有的8位S盒数组的基础上,构造一个16位S盒数组,使用前须初始化,对应的测试程序代码变成:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

/* Provided by sbox.c. */
void sbox_init(void);
uint32_t sbox(uint32_t src);

/*
 * Timing driver: initialises the lookup table, feeds a word through sbox()
 * 100000000 times (each result becomes the next input), prints the final
 * word in hexadecimal and exits successfully.
 */
int main(int argc, char *argv[])
{
        uint32_t        word = 0x00010203;
        uint32_t        remaining = 100000000;

        /* command-line arguments are not used */
        (void)argc;
        (void)argv;

        sbox_init();

        while (remaining-- > 0) {
                word = sbox(word);
        }

        printf("data = %08x\n", word);

        exit(EXIT_SUCCESS);
}

编译并运行,得到如下结果:

[root@sxy-lenovo step4]# time ./test_sbox
data = 9acd23e0

real	0m0.460s
user	0m0.458s
sys	0m0.000s

相比8位S盒数组的版本,性能提升是明显的。

既然如此,可以考虑将16位S盒的数据单元大小从16位提升到64位,看看内存对齐对于16位S盒有何影响。只要将文件sbox.c的uint16_t数据类型改成uint64_t就行了(下面的代码顺带把sbox函数的参数和返回值类型也从uint32_t改成了uint64_t):

#include <stdint.h>

/*
 * 8-bit substitution table (256 one-byte entries), indexed by the input byte.
 * The accompanying article uses it for the SMS4 cipher. It is only ever read,
 * so it is declared const.
 */
static const uint8_t s_sbox8[256] = {
    0xd6,0x90,0xe9,0xfe,0xcc,0xe1,0x3d,0xb7,0x16,0xb6,0x14,0xc2,0x28,0xfb,0x2c,0x05,
    0x2b,0x67,0x9a,0x76,0x2a,0xbe,0x04,0xc3,0xaa,0x44,0x13,0x26,0x49,0x86,0x06,0x99,
    0x9c,0x42,0x50,0xf4,0x91,0xef,0x98,0x7a,0x33,0x54,0x0b,0x43,0xed,0xcf,0xac,0x62,
    0xe4,0xb3,0x1c,0xa9,0xc9,0x08,0xe8,0x95,0x80,0xdf,0x94,0xfa,0x75,0x8f,0x3f,0xa6,
    0x47,0x07,0xa7,0xfc,0xf3,0x73,0x17,0xba,0x83,0x59,0x3c,0x19,0xe6,0x85,0x4f,0xa8,
    0x68,0x6b,0x81,0xb2,0x71,0x64,0xda,0x8b,0xf8,0xeb,0x0f,0x4b,0x70,0x56,0x9d,0x35,
    0x1e,0x24,0x0e,0x5e,0x63,0x58,0xd1,0xa2,0x25,0x22,0x7c,0x3b,0x01,0x21,0x78,0x87,
    0xd4,0x00,0x46,0x57,0x9f,0xd3,0x27,0x52,0x4c,0x36,0x02,0xe7,0xa0,0xc4,0xc8,0x9e,
    0xea,0xbf,0x8a,0xd2,0x40,0xc7,0x38,0xb5,0xa3,0xf7,0xf2,0xce,0xf9,0x61,0x15,0xa1,
    0xe0,0xae,0x5d,0xa4,0x9b,0x34,0x1a,0x55,0xad,0x93,0x32,0x30,0xf5,0x8c,0xb1,0xe3,
    0x1d,0xf6,0xe2,0x2e,0x82,0x66,0xca,0x60,0xc0,0x29,0x23,0xab,0x0d,0x53,0x4e,0x6f,
    0xd5,0xdb,0x37,0x45,0xde,0xfd,0x8e,0x2f,0x03,0xff,0x6a,0x72,0x6d,0x6c,0x5b,0x51,
    0x8d,0x1b,0xaf,0x92,0xbb,0xdd,0xbc,0x7f,0x11,0xd9,0x5c,0x41,0x1f,0x10,0x5a,0xd8,
    0x0a,0xc1,0x31,0x88,0xa5,0xcd,0x7b,0xbd,0x2d,0x74,0xd0,0x12,0xb8,0xe5,0xb4,0xb0,
    0x89,0x69,0x97,0x4a,0x0c,0x96,0x77,0x7e,0x65,0xb9,0xf1,0x09,0xc5,0x6e,0xc6,0x84,
    0x18,0xf0,0x7d,0xec,0x3a,0xdc,0x4d,0x20,0x79,0xee,0x5f,0x3e,0xd7,0xcb,0x39,0x48
};

/*
 * 16-bit-indexed lookup table stored in 64-bit cells: 65536 entries, of
 * which only the low 16 bits of each are ever set by sbox_init().
 */
static uint64_t s_sbox[65536];

/*
 * Fill s_sbox from the 8-bit table s_sbox8.
 *
 * For an index made of a high byte and a low byte, the entry holds
 * s_sbox8[high] in bits 8..15 and s_sbox8[low] in bits 0..7.
 * Must be called before s_sbox is used for lookups.
 */
void sbox_init(void)
{
        unsigned        hi, lo;

        for (hi = 0; hi < 256; hi++) {
                for (lo = 0; lo < 256; lo++) {
                        /* the two substituted bytes occupy disjoint bit ranges */
                        s_sbox[(hi << 8) | lo] =
                                ((uint64_t)s_sbox8[hi] << 8) | s_sbox8[lo];
                }
        }
}

/*
 * Substitute the low 32 bits of src with two lookups in the table s_sbox.
 *
 * Bits 0..15 and bits 16..31 of src are each used as a table index; the two
 * results are recombined in the same positions. Bits above 31 are ignored.
 * s_sbox must have been filled by sbox_init() beforehand.
 *
 * src:     value whose low 32 bits are to be substituted.
 * returns: the substituted 32-bit word, zero-extended to 64 bits.
 */
uint64_t sbox(uint64_t src)
{
        uint64_t lo = s_sbox[src & 0xffff];
        /*
         * Mask the high index too: src is 64 bits wide, so src >> 16 on its
         * own can exceed 65535 and read past the end of s_sbox.
         */
        uint64_t hi = s_sbox[(src >> 16) & 0xffff];

        return lo ^ (hi << 16);
}

编译运行测试程序,结果是:

[root@sxy-lenovo step5]# time ./test_sbox
data = 9acd23e0

real	0m0.840s
user	0m0.838s
sys	0m0.002s

这不是我想要的结果,这不科学!!!64位数据单元可能过大了,改成32位再试试:

[root@sxy-lenovo step6]# time ./test_sbox
data = 9acd23e0

real	0m0.530s
user	0m0.529s
sys	0m0.000s

这个结果比64位数据单元的版本强,但依旧不如16位数据单元的版本,究竟是什么导致了性能劣化呢?答案是显而易见的:CPU缓存。

[root@sxy-lenovo step6]# cat /proc/cpuinfo 
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 37
model name	: Intel(R) Core(TM) i3 CPU       M 370  @ 2.40GHz
stepping	: 5
microcode	: 4
cpu MHz		: 2399.000
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 11
wp		: yes

运行程序的是一个老款笔记本电脑,其CPU的一级缓存为64KB,二级缓存为256KB,三级缓存为3MB,最初的8位S盒数组数据单元仅8位,占用了256字节,即便扩展到64位数据单元,也占用不过2KB内存,若使用16位S盒,当数据单元为16位时,占用128KB内存,当数据单元达到32位时,内存占用到了256KB,当数据单元达到64位时,内存占用更是达到了512KB,再依据测试消耗时间就可以得到初步结论:若S盒数组占用内存小于当前CPU的二级缓存大小时,性能将达到最优,若超出反而会变差。

到这里,可以明确地说:C语言的S盒置换优化之路走到了尽头,事态的发展真的是走投无路了吗?搞过单片机或是FPGA的都知道,通用CPU实施对称密码算法运算几乎无优势可言,反而是专用芯片无论是在成本还是在综合性能上都牢牢占据优势。造成这一现象的最主要的矛盾来自于CPU主频和内存芯片主频的巨大差异,大家都知道,限于PC平台的成本要求,与其采用高频内存芯片,还不如提高CPU缓存性能与大小更合算。解决S盒置换的终极方案之一早已在通用CPU中加入,以Intel为例,所谓的AESNI指令就是干这个的,使用AESNI指令进行AES算法运算能大幅度提高性能,里面的关键点就是抛弃了传统的S盒查表置换方式,直接用内置硬件计算AES算法专用的GF(2^8)有限域乘法和乘法逆,这使得AES算法在支持AESNI指令集的CPU上获得超乎想象的性能增益,而对于本文涉及的国密SMS4算法而言,AESNI是没有任何意义的,要想突破查表法的性能限制,就必须走另外的路。

 

转载于:https://my.oschina.net/safedead/blog/832940

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
AES的S 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 63 7C 77 7B F2 6B 6F C5 30 01 67 2B FE D7 AB 76 1 CA 82 C9 7D FA 59 47 F0 AD D4 A2 AF 9C A4 72 C0 2 B7 FD 93 26 36 3F F7 CC 34 A5 E5 F1 71 D8 31 15 3 04 C7 23 C3 18 96 05 9A 07 12 80 E2 EB 27 B2 75 4 09 83 2C 1A 1B 6E 5A A0 52 3B D6 B3 29 E3 2F 84 5 53 D1 00 ED 20 FC B1 5B 6A CB BE 39 4A 4C 58 CF 6 D0 EF AA FB 43 4D 33 85 45 F9 02 7F 50 3C 9F A8 7 51 A3 40 8F 92 9D 38 F5 BC B6 DA 21 10 FF F3 D2 8 CD 0C 13 EC 5F 97 44 17 C4 A7 7E 3D 64 5D 19 73 9 60 81 4F DC 22 2A 90 88 46 EE B8 14 DE 5E 0B DB A E0 32 3A 0A 49 06 24 5C C2 D3 AC 62 91 95 E4 79 B E7 C8 37 6D 8D D5 4E A9 6C 56 F4 EA 65 7A AE 08 C BA 78 25 2E 1C A6 B4 C6 E8 DD 74 1F 4B BD 8B 8A D 70 3E B5 66 48 03 F6 0E 61 35 57 B9 86 C1 1D 9E E E1 F8 98 11 69 D9 8E 94 9B 1E 87 E9 CE 55 28 DF F 8C A1 89 0D BF E6 42 68 41 99 2D 0F B0 54 BB 16 ################################################################################ 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 52 09 6A D5 30 36 A5 38 BF 40 A3 9E 81 F3 D7 FB 1 7C E3 39 82 9B 2F FF 87 34 8E 43 44 C4 DE E9 CB 2 54 7B 94 32 A6 C2 23 3D EE 4C 95 0B 42 FA C3 4E 3 08 2E A1 66 28 D9 24 B2 76 5B A2 49 6D 8B D1 25 4 72 F8 F6 64 86 68 98 16 D4 A4 5C CC 5D 65 B6 92 5 6C 70 48 50 FD ED B9 DA 5E 15 46 57 A7 8D 9D 84 6 90 D8 AB 00 8C BC D3 0A F7 E4 58 05 B8 B3 45 06 7 D0 2C 1E 8F CA 3F 0F 02 C1 AF BD 03 01 13 8A 6B 8 3A 91 11 41 4F 67 DC EA 97 F2 CF CE F0 B4 E6 73 9 96 AC 74 22 E7 AD 35 85 E2 F9 37 E8 1C 75 DF 6E A 47 F1 1A 71 1D 29 C5 89 6F B7 62 0E AA 18 BE 1B B FC 56 3E 4B C6 D2 79 20 9A DB C0 FE 78 CD 5A F4 C 1F DD A8 33 88 07 C7 31 B1 12 10 59 27 80 EC 5F D 60 51 7F A9 19 B5 4A 0D 2D E5 7A 9F 93 C9 9C EF E A0 E0 3B 4D AE 2A F5 B0 C8 EB BB 3C 83 53 99 61 F 17 2B 04 7E BA 77 D6 26 E1 69 14 63 55 21 0C 7D
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值