Linux sse 地址对齐指令,转：汇编学习(一):内存对齐与SSE初探

最新推荐文章于 2021-05-12 19:47:02 发布

weixin_39517546

最新推荐文章于 2021-05-12 19:47:02 发布

阅读量187

点赞数

文章标签： Linux sse 地址对齐指令

第一次发文，正好复习一下这几天的汇编，与各位同仁分享

顺便说下，我是汇编菜鸟，高手们还请自动回避==|

实地学习ASM，再没有比看现成的，带注释的，而且商业化的代码更好的教材了。

机子上装了VC2008，C:\Program Files\Microsoft Visual Studio 9.0\VC\crt 目录下有微软的源代码。

进入intel目录，可以发现C标准函数的asm实现，我们就从其中最简单的memset开始研究。

/************************************************* **********

//C原型

/*
 * Reference C implementation of memset.
 *
 * Stores `value` into each of the first `count` bytes starting at `dst`
 * and returns the original `dst` pointer. When `count` is 0 nothing is
 * written and `dst` is returned unchanged.
 */
char * memset (char *dst, char value, unsigned int count)
{
    char *start = dst;      /* remember the start: dst is advanced below */

    while (count--)
        *dst++ = value;

    return(start);
}

/************************************************* **********

;************************************************* **********

;ASM实现

;-----------------------------------------------------------------------
; char *memset(char *dst, char value, unsigned int count)
;
; Purpose: store the byte "value" into the first "count" bytes at "dst".
; Syntax:  MASM (Intel operand order), 32-bit x86.
; In:      arguments are read directly from the stack on entry:
;            [esp + 4]   = dst
;            [esp + 8]   = value (only the low byte is used)
;            [esp + 0Ch] = count
; Out:     eax = dst (on the paths that return from this procedure)
; Clobb:   ecx, edx, flags. edi is pushed and popped around its use.
; Notes:   * A zero fill of at least 0100h bytes with SSE2 available is
;            handed off to _VEC_memzero with a jmp; that routine is not
;            shown here, so its return value and clobbers are not
;            described.
;          * NOTE(review): rep stosd only moves forward when the direction
;            flag is clear; this code never executes cld, so it assumes
;            the caller guarantees DF = 0 - confirm against the ABI.
;          * NOTE(review): CODESEG is presumably a segment macro from the
;            CRT's include file, which is not part of this listing.
;-----------------------------------------------------------------------

        CODESEG

        extrn   _VEC_memzero:near
        extrn   __sse2_available:dword

        public  memset
memset  proc \
        dst:ptr byte, \
        value:byte, \
        count:dword

        OPTION  PROLOGUE:NONE, EPILOGUE:NONE

        ; Frame-pointer-omission debug record; no prologue is generated,
        ; so arguments are addressed relative to esp below.
        .FPO    ( 0, 3, 0, 0, 0, 0 )

        mov     edx,[esp + 0ch]         ; edx = "count"
        mov     ecx,[esp + 4]           ; ecx points to "dst"

        test    edx,edx                 ; 0?
        jz      short toend             ; if so, nothing to do

        xor     eax,eax
        mov     al,[esp + 8]            ; the byte "value" to be stored

; Special case large block zeroing using SSE2 support.
; All three must hold: value is 0, count is at least 0100h, SSE2 is present.

        test    al,al                   ; memset using zero initializer?
        jne     dword_align
        cmp     edx,0100h               ; block size reaches size threshold?
        jb      dword_align
        cmp     DWORD PTR __sse2_available,0 ; SSE2 supported?
        je      dword_align

        jmp     _VEC_memzero            ; use fast zero SSE2 implementation
                                        ; no return

; Align address on dword boundary:
; check whether there is an unaligned head before the first dword boundary.

dword_align:
        push    edi                     ; preserve edi
        mov     edi,ecx                 ; edi = dest pointer

        cmp     edx,4                   ; if it's less than 4 bytes
        jb      tail                    ; tail needs edi and edx to be initialized

        neg     ecx
        and     ecx,3                   ; ecx = # bytes before dword boundary
        jz      short dwords            ; jump if address already aligned

        sub     edx,ecx                 ; edx = adjusted count (for later)

; Fill the unaligned head one byte at a time.

adjust_loop:
        mov     [edi],al
        add     edi,1
        sub     ecx,1
        jnz     adjust_loop

; Build an eax whose four bytes all equal "value".

dwords:
; set all 4 bytes of eax to [value]
        mov     ecx,eax                 ; ecx=0/0/0/value
        shl     eax,8                   ; eax=0/0/value/0

        add     eax,ecx                 ; eax=0/0/val/val

        mov     ecx,eax                 ; ecx=0/0/val/val

        shl     eax,10h                 ; eax=val/val/0/0

        add     eax,ecx                 ; eax = all 4 bytes = [value]

; Set dword-sized blocks
        mov     ecx,edx                 ; move remaining count to ecx
        and     edx,3                   ; prepare in edx byte count (for tail loop)
        shr     ecx,2                   ; adjust ecx to be dword count
        jz      tail                    ; jump if it was less than 4 bytes

; Core operation: store eax to [edi] ecx times, advancing edi by 4 each time.

        rep     stosd

main_loop_tail:
        test    edx,edx                 ; if there are no tail bytes,
        jz      finish                  ; we finish, and it's time to leave

; Set remaining bytes:
; fill the unaligned region at the end of the block one byte at a time.
; Invariant: edx > 0 on entry to this loop.

tail:
        mov     [edi],al                ; set remaining bytes
        add     edi,1

        sub     edx,1                   ; if there are some more bytes
        jnz     tail                    ; continue to fill them

; Done
finish:
        mov     eax,[esp + 8]           ; return dest pointer (+8, not +4:
                                        ; the saved edi is still on the stack)
        pop     edi                     ; restore edi

        ret

toend:
        mov     eax,[esp + 4]           ; return dest pointer (edi was never pushed)

        ret

memset  endp

        end

;************************************************* *********

1. 对齐

大家可能已经发现，对edi赋值，再执行rep stosd就是memset函数的核心操作，那为什么还有这么多“冗余”代码呢？仔细阅读注释，发现有两处对内存对齐(align)的处理：

首先，目的地址可能不是4字节(32位)的整数倍，如有需要，要在目的地址的前部首先填充那少于4字节的部分；

其次，目的内存块的末尾地址也可能不是4字节的整数倍，在程序进行的末尾，需要特别处理。

这样一来，去除目的内存块的不对齐首尾部，就可以在对齐的中间区域以4字节为单位写入了。

可以看到，整篇代码几乎都在处理对齐问题，为什么对齐这么重要呢？

查阅了资料，原文如下：

一个字或双字操作数跨越了4字节边界，或者一个四字操作数跨越了8字节边界，被认为是未对齐的，从而需要两次总线周期来访问内存。一个字起始地址是奇数但却没有跨越字边界被认为是对齐的，能够在一个总线周期中被访问。

weixin_39517546

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Linux sse 地址对齐指令,转：汇编学习(一):内存对齐与SSE初探

第一次发文，正好复习一下这几天的汇编，与各位同仁分享顺便说下，我是汇编菜鸟，高手们还请自动回避==|实地学习ASM，再没有比看现成的，带注释的，而且商业化的代码更好的教材了。机子上装了VC2008，C:\Program Files\Microsoft Visual Studio9.0\VC\crt目录下有微软的源代码。进入intel目录，可以发现C标准函数的asm实现，我们就从其中最简单的mems...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。