问题描述
Are you tired of hacking?, take some rest here.
Just help me out with my small experiment regarding memcpy performance.
after that, flag is yours.
http://pwnable.kr/bin/memcpy.c
memcpy.c
// compiled with : gcc -o memcpy memcpy.c -m32 -lm
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>
#include <math.h>
// Read the CPU time-stamp counter (x86 / x86-64 only).
//
// RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits
// in EDX. Both halves are captured through explicit output operands and
// combined here, so the result no longer depends on the compiler leaving
// those registers untouched until the function returns, nor on the calling
// convention happening to return 64-bit values in EDX:EAX.
//
// Returns the 64-bit counter value.
unsigned long long rdtsc(void){
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)hi << 32) | lo;
}
// Copy len bytes from src to dest one byte at a time.
//
// dest: destination buffer, at least len bytes long
// src:  source buffer, at least len bytes long
// len:  number of bytes to copy; 0 copies nothing
//
// Returns dest unchanged.
char* slow_memcpy(char* dest, const char* src, size_t len){
    // size_t index: matches the type of len, so the comparison is not
    // mixed-sign and the counter cannot overflow for large lengths.
    size_t i;
    for (i = 0; i < len; i++) {
        dest[i] = src[i];
    }
    return dest;
}
// Copy len bytes from src to dest: whole 64-byte blocks are moved through
// xmm0-xmm3, and whatever is left (len % 64, or everything when len < 64)
// is handed to slow_memcpy().
//
// Alignment: nothing in this function checks or fixes pointer alignment.
// The movntps stores require a 16-byte aligned destination and raise a
// fault (seen as SIGSEGV) otherwise; movdqa is likewise an aligned-only
// load per the Intel SDM. Callers must therefore pass 16-byte aligned
// src and dest whenever len >= 64.
//
// NOTE(review): the asm statement lists only "memory" as a clobber even
// though it overwrites xmm0-xmm3 - confirm this is acceptable for the
// compiler flags in use.
//
// Returns dest as advanced past the block-copied region (the original
// pointer only when len < 64), not the start of the destination buffer.
char* fast_memcpy(char* dest, const char* src, size_t len){
size_t i;
// 64-byte block fast copy
if(len >= 64){
i = len / 64;
// keep only the tail length (len % 64) for the byte-wise copy below
len &= (64-1);
while(i-- > 0){
// load four 16-byte lanes from src into xmm0-xmm3, then write them to
// dest with movntps stores
__asm__ __volatile__ (
"movdqa (%0), %%xmm0\n"
"movdqa 16(%0), %%xmm1\n"
"movdqa 32(%0), %%xmm2\n"
"movdqa 48(%0), %%xmm3\n"
"movntps %%xmm0, (%1)\n"
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
::"r"(src),"r"(dest):"memory");
dest += 64;
src += 64;
}
}
// byte-to-byte slow copy
if(len) slow_memcpy(dest, src, len);
return dest;
}
// Entry point of the memcpy timing experiment.
//
// 1. Prints the banner and asks for ten copy sizes, one per power-of-two
//    range [2^(e-1), 2^e] for e = 4..13. Input that is not a number, or a
//    number outside the range, ends the program.
// 2. For every size, allocates a destination buffer with malloc() and
//    times slow_memcpy() and fast_memcpy() with rdtsc(), copying the two
//    16 KiB cache buffers before each measurement.
// 3. Prints the flag line and returns 0.
//
// Exits with status 1 if a mapping or an allocation cannot be obtained.
int main(void){
    setvbuf(stdout, 0, _IONBF, 0);
    setvbuf(stdin, 0, _IOLBF, 0);

    printf("Hey, I have a boring assignment for CS class.. :(\n");
    printf("The assignment is simple.\n");

    printf("-----------------------------------------------------\n");
    printf("- What is the best implementation of memcpy? -\n");
    printf("- 1. implement your own slow/fast version of memcpy -\n");
    printf("- 2. compare them with various size of data -\n");
    printf("- 3. conclude your experiment and submit report -\n");
    printf("-----------------------------------------------------\n");

    printf("This time, just help me out with my experiment and get flag\n");
    printf("No fancy hacking, I promise :D\n");

    unsigned long long t1, t2;
    int e;
    char* src;
    char* dest;
    unsigned int low, high;
    unsigned int size;

    // allocate memory
    char* cache1 = mmap(0, 0x4000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    char* cache2 = mmap(0, 0x4000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    src = mmap(0, 0x2000, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (cache1 == MAP_FAILED || cache2 == MAP_FAILED || src == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }

    size_t sizes[10];
    int i=0;

    // setup experiment parameters
    for(e=4; e<14; e++){    // 2^13 = 8K
        // integer shifts give the exact powers of two without going
        // through floating point
        low = 1u << (e-1);
        high = 1u << e;
        printf("specify the memcpy amount between %u ~ %u : ", low, high);
        // a failed read must not fall through: size would otherwise keep
        // its previous (or an uninitialised) value and could pass the
        // range check below
        if( scanf("%u", &size) != 1 || size < low || size > high ){
            printf("don't mess with the experiment.\n");
            exit(0);
        }
        sizes[i++] = size;
    }

    sleep(1);
    printf("ok, lets run the experiment with your configuration\n");
    sleep(1);

    // run experiment
    for(i=0; i<10; i++){
        size = sizes[i];
        printf("experiment %d : memcpy with buffer size %u\n", i+1, size);
        // NOTE(review): dest is never freed. Left as is on purpose here:
        // the address each malloc() returns depends on the previous,
        // still-live allocations, and freeing would change that layout.
        dest = malloc( size );
        if (dest == NULL) {
            perror("malloc");
            exit(1);
        }

        memcpy(cache1, cache2, 0x4000);     // to eliminate cache effect
        t1 = rdtsc();
        slow_memcpy(dest, src, size);       // byte-to-byte memcpy
        t2 = rdtsc();
        printf("ellapsed CPU cycles for slow_memcpy : %llu\n", t2-t1);

        memcpy(cache1, cache2, 0x4000);     // to eliminate cache effect
        t1 = rdtsc();
        fast_memcpy(dest, src, size);       // block-to-block memcpy
        t2 = rdtsc();
        printf("ellapsed CPU cycles for fast_memcpy : %llu\n", t2-t1);
        printf("\n");
    }

    printf("thanks for helping my experiment!\n");
    printf("flag : ----- erased in this source code -----\n");
    return 0;
}
简单分析
slow_memcpy 是逐字节复制;fast_memcpy 以64字节为一块,利用xmm寄存器做不经过cache的复制,不足64字节的部分(或总长度小于64字节时)调用slow_memcpy。
编译链接运行
这里在每次为dest申请空间后面加了一句,printf("dest addr :%p\n",dest);
$ gcc -o memcpy memcpy.c -m32 -lm
$ ./memcpy
Hey, I have a boring assignment for CS class.. :(
The assignment is simple.
-----------------------------------------------------
- What is the best implementation of memcpy? -
- 1. implement your own slow/fast version of memcpy -
- 2. compare them with various size of data -
- 3. conclude your experiment and submit report -
-----------------------------------------------------
This time, just help me out with my experiment and get flag
No fancy hacking, I promise :D
specify the memcpy amount between 8 ~ 16 : 8
specify the memcpy amount between 16 ~ 32 : 16
specify the memcpy amount between 32 ~ 64 : 32
specify the memcpy amount between 64 ~ 128 : 64
specify the memcpy amount between 128 ~ 256 : 128
specify the memcpy amount between 256 ~ 512 : 256
specify the memcpy amount between 512 ~ 1024 : 512
specify the memcpy amount between 1024 ~ 2048 : 1024
specify the memcpy amount between 2048 ~ 4096 : 2048
specify the memcpy amount between 4096 ~ 8192 : 4096
ok, lets run the experiment with your configuration
experiment 1 : memcpy with buffer size 8
ellapsed CPU cycles for slow_memcpy : 4620
dest addr :0x57f46410
ellapsed CPU cycles for fast_memcpy : 21792
experiment 2 : memcpy with buffer size 16
ellapsed CPU cycles for slow_memcpy : 828
dest addr :0x57f46420
ellapsed CPU cycles for fast_memcpy : 23100
experiment 3 : memcpy with buffer size 32
ellapsed CPU cycles for slow_memcpy : 768
dest addr :0x57f46438
ellapsed CPU cycles for fast_memcpy : 12456
experiment 4 : memcpy with buffer size 64
ellapsed CPU cycles for slow_memcpy : 1932
dest addr :0x57f46460
ellapsed CPU cycles for fast_memcpy : 14880
experiment 5 : memcpy with buffer size 128
ellapsed CPU cycles for slow_memcpy : 3192
dest addr :0x57f464a8
段错误
调试
$ gdb memcpy -q
Reading symbols from memcpy...(no debugging symbols found)...done.
gdb-peda$ set disassembly-flavor intel
gdb-peda$ r
Starting program: /home/pwd/Desktop/pwdmylife/pwnable/memcpy/memcpy
Hey, I have a boring assignment for CS class.. :(
The assignment is simple.
-----------------------------------------------------
- What is the best implementation of memcpy? -
- 1. implement your own slow/fast version of memcpy -
- 2. compare them with various size of data -
- 3. conclude your experiment and submit report -
-----------------------------------------------------
This time, just help me out with my experiment and get flag
No fancy hacking, I promise :D
specify the memcpy amount between 8 ~ 16 : 8
specify the memcpy amount between 16 ~ 32 : 16
specify the memcpy amount between 32 ~ 64 : 32
specify the memcpy amount between 64 ~ 128 : 64
specify the memcpy amount between 128 ~ 256 : 128
specify the memcpy amount between 256 ~ 512 : 256
specify the memcpy amount between 512 ~ 1024 : 512
specify the memcpy amount between 1024 ~ 2048 : 1024
specify the memcpy amount between 2048 ~ 4096 : 2048
specify the memcpy amount between 4096 ~ 8192 : 4096
ok, lets run the experiment with your configuration
experiment 1 : memcpy with buffer size 8
ellapsed CPU cycles for slow_memcpy : 5376
dest addr :0x56559410
ellapsed CPU cycles for fast_memcpy : 50632
experiment 2 : memcpy with buffer size 16
ellapsed CPU cycles for slow_memcpy : 544
dest addr :0x56559420
ellapsed CPU cycles for fast_memcpy : 20176
experiment 3 : memcpy with buffer size 32
ellapsed CPU cycles for slow_memcpy : 672
dest addr :0x56559438
ellapsed CPU cycles for fast_memcpy : 14136
experiment 4 : memcpy with buffer size 64
ellapsed CPU cycles for slow_memcpy : 1184
dest addr :0x56559460
ellapsed CPU cycles for fast_memcpy : 13944
experiment 5 : memcpy with buffer size 128
ellapsed CPU cycles for slow_memcpy : 2040
dest addr :0x565594a8
Program received signal SIGSEGV, Segmentation fault.
[----------------------------------registers-----------------------------------]
EAX: 0xf7fc8000 --> 0x0
EBX: 0x56558000 --> 0x2ee8
ECX: 0xffff9790 ("dest addr :0x565594a8\nr slow_memcpy : 2040\n\n2 : ")
EDX: 0x565594a8 --> 0x0
ESI: 0x1
EDI: 0xf7f55000 --> 0x1b2db0
EBP: 0xffffbca8 --> 0xffffbd38 --> 0x0
ESP: 0xffffbc98 --> 0xffffbcb4 --> 0xf7fc8000 --> 0x0
EIP: 0x5655588f (<fast_memcpy+62>: movntps XMMWORD PTR [edx],xmm0)
EFLAGS: 0x10202 (carry parity adjust zero sign trap INTERRUPT direction overflow)
[-------------------------------------code-------------------------------------]
0x56555880 <fast_memcpy+47>: movdqa xmm1,XMMWORD PTR [eax+0x10]
0x56555885 <fast_memcpy+52>: movdqa xmm2,XMMWORD PTR [eax+0x20]
0x5655588a <fast_memcpy+57>: movdqa xmm3,XMMWORD PTR [eax+0x30]
=> 0x5655588f <fast_memcpy+62>: movntps XMMWORD PTR [edx],xmm0
0x56555892 <fast_memcpy+65>: movntps XMMWORD PTR [edx+0x10],xmm1
0x56555896 <fast_memcpy+69>: movntps XMMWORD PTR [edx+0x20],xmm2
0x5655589a <fast_memcpy+73>: movntps XMMWORD PTR [edx+0x30],xmm3
0x5655589e <fast_memcpy+77>: add DWORD PTR [ebp+0x8],0x40
[------------------------------------stack-------------------------------------]
0000| 0xffffbc98 --> 0xffffbcb4 --> 0xf7fc8000 --> 0x0
0004| 0xffffbc9c --> 0xf7e13a25 (<__GI___libc_malloc+197>: test eax,eax)
0008| 0xffffbca0 --> 0x56558000 --> 0x2ee8
0012| 0xffffbca4 --> 0x1
0016| 0xffffbca8 --> 0xffffbd38 --> 0x0
0020| 0xffffbcac ("O\\UV\250\224UV")
0024| 0xffffbcb0 --> 0x565594a8 --> 0x0
0028| 0xffffbcb4 --> 0xf7fc8000 --> 0x0
[------------------------------------------------------------------------------]
Legend: code, data, rodata, value
Stopped reason: SIGSEGV
0x5655588f in fast_memcpy ()
分析
网上找到有关SSE指令movntps的资料
movntps m128,XMM m128 <== XMM 直接把XMM中的值送入m128,不经过cache,必须对齐16字节.
这里edx存放了dest的首地址,16字节对齐要求该地址的最低4个二进制位均为0(即十六进制末位为0);而dest的空间是malloc在堆上申请的,崩溃时的地址0x565594a8末位为8,不满足对齐要求。
32位,堆的结构
| 4 bytes (prev_size) | 4 bytes (size + 3 bits flag |A|M|P) |
| data | |
solve
每次申请合适大小的空间,使下一次malloc返回的dest地址的最低4个二进制位为0(即16字节对齐)
#!/usr/bin/env python
#coding:utf-8
#made by pwd
from pwn import *
import sys
import math
# For every prompt of memcpy.c, print the smallest request size whose heap
# chunk is a multiple of 16 bytes. With such sizes each malloc() result
# keeps the 16-byte alignment of the previous one.
#
# Chunk-size model: the request plus a 4-byte size field, rounded up to a
# multiple of 8, and never smaller than 16 bytes. This matches the dest
# addresses printed in the runs shown earlier (8 -> 16, 16 -> 24,
# 32 -> 40, 64 -> 72).
# NOTE(review): this assumes an allocator with 8-byte chunk granularity;
# confirm against the libc on the target machine.
for e in range(4, 14):  # same exponents as the prompt loop in memcpy.c
    low, high = 2 ** (e - 1), 2 ** e
    print("########### %d" % e)
    for n in range(low, high + 1):
        # integer arithmetic only, so the rounding really rounds and the
        # printed size is an int that can be typed at the prompt
        chunk = max(16, (n + 4 + 7) & ~7)
        if chunk % 16 == 0:
            print(n)
            break