Part I to write y-86 programs
recursive sum_list
c code
/* A singly linked list node; list_ptr is a pointer to such a node. */
typedef struct ELE {
    long val;
    struct ELE *next;
} *list_ptr;

/*
 * rsum_list - recursively add up the val fields of a list.
 *
 * ls: first node of the list, or a null pointer for an empty list.
 * Returns the sum of every node's val; an empty list sums to 0.
 */
long rsum_list(list_ptr ls)
{
    if (ls)
        return ls->val + rsum_list(ls->next);
    return 0;
}
y-86 code
# Driver for rsum_list: passes the address labelled ele1 as the list argument.
# ele1 is defined outside this excerpt.
main:
irmovq ele1 , %rdi #%rdi = ls, the argument rsum_list reads
call rsum_list #result comes back in %rax
ret #pay attention! main ends with ret, so it needs a caller that set up %rsp (not shown here)
# long rsum_list(list_ptr ls)
#   In:       %rdi = ls (address of the first node, 0 for an empty list)
#   Out:      %rax = sum of the val fields of all nodes reachable from ls
#   Clobbers: %rdi, %r9 (both only when ls != 0), condition codes
#   Stack:    one 8-byte slot plus one return address per list node
rsum_list:
    irmovq $0,%rax          # result for the empty list; irmovq leaves the flags alone
    andq   %rdi,%rdi        # ls == 0 ?
    je     rsum_done        #   yes: return the 0 already in %rax
    mrmovq (%rdi),%r9       # %r9 = ls->val
    pushq  %r9              # val must survive the recursive call
    mrmovq 8(%rdi),%rdi     # %rdi = ls->next
    call   rsum_list        # %rax = rsum_list(ls->next)
    popq   %r9              # this node's val again
    addq   %r9,%rax         # %rax = val + rest
rsum_done:
    ret
copy a source block to a destination block
c code
/*
 * copy_block - copy len longs from src to dest.
 *
 * Returns the XOR of all copied values; when len <= 0 nothing is copied
 * and the result is 0.
 */
long copy_block(long *src, long *dest, long len)
{
    long checksum = 0;
    long i;

    for (i = 0; i < len; i++) {
        long word = src[i];   /* read first, then store, as in *dest++ = *src++ */
        dest[i] = word;
        checksum ^= word;
    }
    return checksum;
}
y-86 code
# Driver for copy: loads the three arguments and calls it.
# src and dest are labels defined outside this excerpt.
main:
irmovq src , %rdi #%rdi = address to copy from
irmovq dest , %rsi #%rsi = address to copy to
irmovq $3,%rdx #%rdx = len, number of words to copy
call copy #result (XOR of the copied words) comes back in %rax
ret #pay attention! main ends with ret, so it needs a caller that set up %rsp (not shown here)
# long copy(long *src, long *dest, long len)  -- Y86-64 version of copy_block
#   In:       %rdi = src, %rsi = dest, %rdx = len (count of 8-byte words)
#   Out:      %rax = XOR of all copied words; 0 when len <= 0
#   Clobbers: %rdi, %rsi, %rdx, %r8, %r9, %r10, condition codes
copy:
    irmovq $1,%r8           # constant 1 for len--
    irmovq $8,%r9           # pointer step: one long is 8 bytes
    irmovq $0,%rax          # result = 0
    andq   %rdx,%rdx        # while (len > 0): test before the first iteration
    jle    copy_done        # len <= 0: copy nothing, return 0
loop:
    mrmovq (%rdi),%r10      # val = *src
    addq   %r9,%rdi         # src++
    rmmovq %r10,(%rsi)      # *dest = val
    addq   %r9,%rsi         # dest++
    xorq   %r10,%rax        # result ^= val
    subq   %r8,%rdx         # len-- (last flag-setting op before the branch)
    jg     loop             # keep going while len > 0
copy_done:
    ret
tips
1 order of ++:
val=*src++ --> val=*src; src++;
*dest++=val --> *dest=val; dest++;
2 pointer
dest++ advances the pointer by 8 bytes (one long), not 8 bits!!
SEQ processor
to implement iaddq in seq processor
an easy task
Status = HLT
Condition Codes: Z=1 S=0 O=0
Changed Register State:
%rax: 0x0000000000000000 0x0000abcdabcdabcd
%rsp: 0x0000000000000000 0x0000000000000100
%rdi: 0x0000000000000000 0x0000000000000038
%r10: 0x0000000000000000 0x0000a000a000a000
Changed Memory State:
0x00f0: 0x0000000000000000 0x0000000000000055
0x00f8: 0x0000000000000000 0x0000000000000013
ISA Check Succeeds
pipeline
implement iaddq
edit pipeline-full
average CPE 15.18–>12.70
Loop header
# Baseline copy loop, one element per iteration, using iaddq for the updates.
# Register roles (from the comments below): %rdi = src, %rsi = dst,
# %rdx = len, %rax = count of copied values that are > 0.
# The Done label is defined outside this excerpt.
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:
Loop:
mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, skip count++ (goto Npos)
iaddq $1, %rax # count++
Npos:
iaddq $-1, %rdx # len--
iaddq $8, %rdi # src++ (8 bytes per element)
iaddq $8, %rsi # dst++ (8 bytes per element)
andq %rdx,%rdx # len > 0? (re-tests len because the two iaddq above overwrote the flags)
jg Loop # if so, goto Loop:
Loop Unrolling
Why
1 reduce unnecessary loop indexing and conditional branching
2 expose ways to further transform the code
# data hazard: the store uses %r10 in the instruction right after the load that produces it
mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
#resolution -- to get data in advance: load the next element before storing the current one
# Sketch only: the pointers are not advanced here and Loop2 is not part of this snippet.
Loop: mrmovq (%rdi), %r10 # read val from src...
mrmovq 8(%rdi),%r9 # prefetch element 1; separates the load of %r10 from its use
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, skip count++ (goto Npos)
iaddq $1, %rax # count++
Npos:
iaddq $-1, %rdx # len--
jne Loop1 # elements left: handle element 1
ret # len reached 0
Loop1:
mrmovq 16(%rdi),%r8 # prefetch element 2
rmmovq %r9,8(%rsi) # store element 1, loaded during the previous step
andq %r9,%r9 # element 1 <= 0?
jle Npos1 # if so, skip count++
iaddq $1,%rax # count++
Npos1:
iaddq $-1,%rdx # len--
jne Loop2 # elements left: continue with the next step
ret # len reached 0
## control hazard: set the flags early and put independent loads/stores between andq and jle
# The jle still tests the flags from andq, since no instruction in between sets flags.
# Sketch only: Loop1 is not part of this snippet.
Loop:
mrmovq (%rdi), %r10 # read val from src...
mrmovq 8(%rdi),%r9 # prefetch element 1
andq %r10, %r10 # val <= 0?
mrmovq 16(%rdi),%r8 # prefetch element 2
rmmovq %r10,(%rsi) # store element 0 to dst
jle Npos # if so, skip count++ (goto Npos)
iaddq $1, %rax # count++
Npos:
iaddq $-1, %rdx # len--
jne Loop1 # elements left: handle element 1
ret # len reached 0
in the end, loop unrolling for 8 times
CPE:8.96
remainder
instead of checking whether len > 0 after every element
check whether len >= 9
divide the loop into 2 parts: while len >= 9, go through the unrolled body without a per-element len check
when len < 9, jump to the remainder part
use a 3-ary tree to decide where to enter the remainder code
# Main copy loop, unrolled to process 9 elements per iteration.
# %rdx is kept biased by -9: it holds (elements left) - 9, so one jl/jge
# decides whether a full group of 9 is still available.
# Loads rotate through %r10, %r9, %r8 and are issued before the value is needed.
# Each jle tests the flags of the preceding andq; the mrmovq/rmmovq placed
# in between are relied on to leave the condition codes unchanged.
iaddq $-9,%rdx # %rdx = len - 9
jl remain # fewer than 9 elements: go to the remainder code
Loop:
mrmovq (%rdi), %r10 # read val from src...
mrmovq 8(%rdi),%r9 # prefetch element 1
andq %r10, %r10 # val <= 0?
mrmovq 16(%rdi),%r8 # prefetch element 2
rmmovq %r10,(%rsi) # store element 0
jle Loop1 # if so, skip count++ (goto Loop1)
iaddq $1, %rax # count++
Loop1:
andq %r9,%r9 # element 1 <= 0?
rmmovq %r9 , 8(%rsi)
mrmovq 24(%rdi),%r10 # prefetch element 3
jle Loop2
iaddq $1,%rax
Loop2:
andq %r8,%r8 # element 2 <= 0?
rmmovq %r8 ,16 (%rsi)
mrmovq 32(%rdi),%r9 # prefetch element 4
jle Loop3
iaddq $1,%rax
Loop3:
andq %r10,%r10 # element 3 <= 0?
rmmovq %r10 , 24(%rsi)
mrmovq 40(%rdi),%r8 # prefetch element 5
jle Loop4
iaddq $1,%rax
Loop4:
andq %r9,%r9 # element 4 <= 0?
rmmovq %r9 , 32(%rsi)
mrmovq 48(%rdi),%r10 # prefetch element 6
jle Loop5
iaddq $1,%rax
Loop5:
andq %r8,%r8 # element 5 <= 0?
rmmovq %r8 , 40(%rsi)
mrmovq 56(%rdi),%r9 # prefetch element 7
jle Loop6
iaddq $1,%rax
Loop6:
andq %r10,%r10 # element 6 <= 0?
mrmovq 64(%rdi),%r8 # prefetch element 8 (last of the group)
rmmovq %r10 , 48(%rsi)
jle Loop7
iaddq $1,%rax
Loop7:
andq %r9,%r9 # element 7 <= 0?
rmmovq %r9 , 56(%rsi)
jle Loop8
iaddq $1,%rax
Loop8:
andq %r8,%r8 # element 8 <= 0?
rmmovq %r8,64(%rsi)
jle test
iaddq $1,%rax
test:
iaddq $72,%rdi # src += 9 elements (9 * 8 bytes)
iaddq $72,%rsi # dst += 9 elements
iaddq $-9,%rdx # 9 fewer elements left; must be the last flag-setting op before jge
jge Loop # another full group of 9 is available
# Remainder handling, linear version.
# Presumably entered with %rdx = rem - 9, where rem (0..8) is the number of
# elements still to copy -- that is the value the unrolled loop leaves behind.
# Each step adds 1 to %rdx and skips its element while the result is still
# negative, so the element at index K is copied only when rem > K.
# Every step issues its load before the check, so the source word is read
# even when the element is then skipped.
# The Done label is defined outside this excerpt.
remain:
mrmovq 56(%rdi),%r10 # element 7
iaddq $1,%rdx # %rdx = rem - 8
jl r7 # rem < 8: element 7 is not part of the remainder
andq %r10,%r10 # element 7 <= 0?
rmmovq %r10,56(%rsi)
jle r7 # if so, skip count++
iaddq $1,%rax
r7:
mrmovq 48(%rdi),%r9 # element 6
iaddq $1,%rdx
jl r6 # rem < 7
andq %r9,%r9
rmmovq %r9,48(%rsi)
jle r6
iaddq $1,%rax
r6:
mrmovq 40(%rdi),%r8 # element 5
iaddq $1,%rdx
jl r5 # rem < 6
andq %r8,%r8
rmmovq %r8,40(%rsi)
jle r5
iaddq $1,%rax
r5:
mrmovq 32(%rdi),%r10 # element 4
iaddq $1,%rdx
jl r4 # rem < 5
andq %r10,%r10
rmmovq %r10,32(%rsi)
jle r4
iaddq $1,%rax
r4:
mrmovq 24(%rdi),%r9 # element 3
iaddq $1,%rdx
jl r3 # rem < 4
andq %r9,%r9
rmmovq %r9,24(%rsi)
jle r3
iaddq $1,%rax
r3:
mrmovq 16(%rdi),%r8 # element 2
iaddq $1,%rdx
jl r2 # rem < 3
andq %r8,%r8
rmmovq %r8,16(%rsi)
jle r2
iaddq $1,%rax
r2:
mrmovq 8(%rdi),%r10 # element 1
iaddq $1,%rdx
jl r1 # rem < 2
andq %r10,%r10
rmmovq %r10,8(%rsi)
jle r1
iaddq $1,%rax
r1:
mrmovq (%rdi),%r9 # element 0
iaddq $1,%rdx
jl Done # rem < 1: nothing left to copy
andq %r9,%r9
rmmovq %r9,(%rsi)
jle Done
iaddq $1,%rax
CPE 33.8
use a 3-ary tree to reduce search time
# Remainder handling with a 3-way decision tree.
# The branch comments below (#<3, #3, #1, ...) imply that %rdx = rem - 9 on
# entry, where rem (0..8) is the number of elements still to copy.
# The tree jumps to label rK, which copies elements K-1 down to 0.
# Each rK begins with a load followed by a jle that tests the flags left by
# the PREVIOUS element's andq. Every jump from the tree into rK therefore
# arrives with ZF or SF set (from je / jl / a zero iaddq result), so that
# first jle is taken and no spurious count++ happens.
# The Done label is defined outside this excerpt.
remain: iaddq $6,%rdx # %rdx = rem - 3
jl Tr1 # <3
jg Tr3 # >3
Tr2:
jmp r3 #3
Tr1:
iaddq $2,%rdx # %rdx = rem - 1
je r1 #1
iaddq $-1,%rdx # %rdx = rem - 2
je r2 #2
ret # rem == 0: nothing left to copy
Tr31:
iaddq $1,%rdx # %rdx = rem - 5 (rem is 4 or 5 here)
jl r4 # rem == 4
jmp r5 # rem == 5 (ZF still set from the iaddq)
Tr3:
iaddq $-3,%rdx # %rdx = rem - 6
jl Tr31 #<6
jg Tr33 # >6
jmp r6 #6
Tr33:
iaddq $-1,%rdx # %rdx = rem - 7 (rem is 7 or 8 here)
je r7 # rem == 7; otherwise fall through to r8
r8:
mrmovq 56(%rdi),%r10 # element 7
rmmovq %r10,56(%rsi)
andq %r10,%r10 # flags for element 7, tested by the jle under r7
r7:
mrmovq 48(%rdi),%r9 # element 6, loaded before branching on the previous element
jle r72 # previous element <= 0, or entry from the tree: skip count++
iaddq $1,%rax
r72:
rmmovq %r9,48(%rsi)
andq %r9,%r9 # flags for element 6
r6:
mrmovq 40(%rdi),%r8 # element 5
jle r62
iaddq $1,%rax
r62:
rmmovq %r8,40(%rsi)
andq %r8,%r8 # flags for element 5
r5:
mrmovq 32(%rdi),%r10 # element 4
jle r52
iaddq $1,%rax
r52:
rmmovq %r10,32(%rsi)
andq %r10,%r10 # flags for element 4
r4:
mrmovq 24(%rdi),%r9 # element 3
jle r42
iaddq $1,%rax
r42:
rmmovq %r9,24(%rsi)
andq %r9,%r9 # flags for element 3
r3:
mrmovq 16(%rdi),%r8 # element 2
jle r32
iaddq $1,%rax
r32:
rmmovq %r8,16(%rsi)
andq %r8,%r8 # flags for element 2
r2:
mrmovq 8(%rdi),%r10 # element 1
jle r22
iaddq $1,%rax
r22:
rmmovq %r10,8(%rsi)
andq %r10,%r10 # flags for element 1
r1:
mrmovq (%rdi),%r9 # element 0
jle r12
iaddq $1,%rax
r12:
rmmovq %r9,(%rsi)
andq %r9,%r9 # element 0 <= 0?
jle Done # if so, finish without counting it
iaddq $1,%rax # count++ for element 0
Part 4 some questions
y-86 code
1 how do the condition codes (ZF, SF, OF -- Y86-64 has no CF) change after andq?
E.G. if %r10<=0 jump to next
andq %r10,%r10
jle Next