彻底理解setjmp/longjmp并DIY一个简单的协程

最新推荐文章于 2022-12-06 12:43:43 发布

dog250

最新推荐文章于 2022-12-06 12:43:43 发布

阅读量9.9k

点赞数 26

文章标签： setjmp longjmp 协程用户态多线程

本文链接：https://blog.csdn.net/dog250/article/details/89742140

版权

上海昔日的邻居来杭州，我们小聚。今天一起带着小小去了西溪湿地，体验并不是太好，门票太贵，进去需乘船，船票快顶得上门票了，简直就是要抢劫啊！不过，景点嘛，一般我是不去的。

缘由

下面的代码写于5月2日(也就是今天刚开始的时候)凌晨1点钟，假期首日逛玩了一天，晚上自己写个东西玩玩，主要是因为之前想用setjmp/longjmp通过修改jmp_buf结构体的某些寄存器字段而实现协程而没有成功，就自己做个类似的，突发奇想，就赶紧动手尝试，很简单，前后半小时吧，就成功了。

我在5月2日凌晨两点左右发过朋友圈了，直接贴的就是代码，没别的。所以我想在博客里再多说一点。代码嘛，不重要，心里想表达的一些想法在我看来才更重要。

在很多人看来，我这么简单的代码看起来非常low，但是谁让我不会编程写不出什么复杂的东西呢？唉，也是郁闷！

不过，我先抛出我的观点，不是说简单就一定很low，简单之所以让人觉得很low，一般是因为简单的东西看起来没有工作量，而没有工作量的东西不符合我们传统的 勤劳，时刻忙碌 的价值观。

代码(DIY版)

先上代码，再讲故事。

先给出一个我自己的setjmp/longjmp的实现，我把它们改成了更好的两个名字，即 save/restore， 我觉得这比setjmp/longjmp更加直观。

我是用汇编写的save/restore：

# save_restore.s
# 暂且就先保存sp,bp,r12~r15这几个寄存器，不够可以再添。
# as --64 -o save_restore.o save_restore.s
.global save, restore
# save(ctx): store the caller's resumable register state in the buffer that
# %rdi points to and return 0 in %rax.
# Buffer layout by byte offset: 0 = rsp, 8 = rbp, 16 = return address,
# 24/32/40/48 = r12/r13/r14/r15.
# NOTE(review): %rbx is not stored although the System V x86-64 ABI also
# treats it as callee-saved -- confirm whether that is intentional.
save:
	# (%rsp) holds the return address pushed by `call save`, so 8(%rsp) is the
	# stack pointer value the caller will have once save has returned.
	leaq 8(%rsp), %rdx
	movq %rdx, (0)(%rdi)
	movq %rbp, (8)(%rdi)
	# (%rsp) is the return address pushed by `call save` -- this is the key.
	# It goes into the third 8-byte slot (the "rip" field); overwriting that
	# slot later changes where restore resumes execution.
	movq (%rsp), %rdx
	movq %rdx, (16)(%rdi)
	movq %r12, (24)(%rdi)
	movq %r13, (32)(%rdi)
	movq %r14, (40)(%rdi)
	movq %r15, (48)(%rdi)
	movq $0, %rax	# a direct call returns 0, a convention borrowed from setjmp
	retq

# restore(ctx): reload rsp, rbp and r12-r15 from the buffer that %rdi points to
# and continue at the address stored in its third 8-byte slot, with %rax = 1.
restore:
	movq (0)(%rdi), %rsp
	movq (8)(%rdi), %rbp
	# Fetch the resume address from the third 8-byte slot and store it at the
	# new top of stack, so that the closing retq pops it and jumps there.
	# NOTE(review): this store overwrites the 8 bytes at the saved rsp, and the
	# retq then pops them, so execution resumes with %rsp 8 bytes above the
	# saved value -- confirm every resume site tolerates that.
	movq (16)(%rdi), %rdx
	movq %rdx, (%rsp)
	movq (24)(%rdi), %r12
	movq (32)(%rdi), %r13
	movq (40)(%rdi), %r14
	movq (48)(%rdi), %r15
	movq $1, %rax	# resuming through restore always yields 1; the value is hard-coded
	retq

我们来测试一下它的效果：

/* test_saverestore.c */
/* gcc -c test_saverestore.c -o test_saverestore.o */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Register snapshot written by save() and read back by restore().
 * The assembly addresses each field by a fixed byte offset, so the field
 * order and sizes must stay exactly as they are.
 */
struct context {
	unsigned long rsp;	/* offset 0:  stack pointer as it is after save() returns */
	unsigned long rbp;	/* offset 8:  frame pointer */
	unsigned long rip;	/* offset 16: return address of the save() call */
	unsigned long r12;	/* offset 24 */
	unsigned long r13;	/* offset 32 */
	unsigned long r14;	/* offset 40 */
	unsigned long r15;	/* offset 48 */
};

struct context ctx;

/*
 * save() and restore() live in save_restore.s and have no header, so they are
 * declared here. Calling them undeclared is invalid since C99. save() can
 * return a second time (through restore), which GCC has to be told about so it
 * does not keep values in registers across the call.
 */
int save(struct context *ctx) __attribute__((returns_twice));
int restore(struct context *ctx);

/*
 * Smoke test for save()/restore().
 *
 * The first return from save() yields 0: the program prints "from setjmp"
 * and jumps back with restore(). The second return yields 1 and the program
 * prints "from longjmp".
 *
 * Returns 0.
 */
int main(void)
{
	if (save(&ctx) == 0) {
		printf("from setjmp\n");
		restore(&ctx);
	} else {
		printf("from longjmp\n");
	}
	return 0;
}

编译之：

[root@localhost ~]# as --64 -o save_restore.o save_restore.s
[root@localhost ~]# gcc -c test_saverestore.c -o test_saverestore.o
[root@localhost ~]# gcc -o a.out save_restore.o test_saverestore.o

看效果：

[root@localhost ~]# ./a.out
from setjmp
from longjmp

既然达到了效果，我就直接用这个写协程了。这可是我自己从零到一用汇编写的，我想怎么改数据结构就怎么改，而这是用glibc的setjmp/longjmp做不到的，至少是很难做到的。

我下面直接给出我的代码，故事在代码之后再细说。

/* uthread.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Register snapshot written by save() and read back by restore().
 * The assembly addresses each field by a fixed byte offset, so the field
 * order and sizes must stay exactly as they are. main() overwrites rsp, rbp
 * and rip by hand to point a context at a fresh stack and entry function.
 */
struct context {
	unsigned long rsp;	/* offset 0:  stack pointer */
	unsigned long rbp;	/* offset 8:  frame pointer */
	unsigned long rip;	/* offset 16: address at which restore() resumes */
	unsigned long r12;	/* offset 24 */
	unsigned long r13;	/* offset 32 */
	unsigned long r14;	/* offset 40 */
	unsigned long r15;	/* offset 48 */
};

unsigned char *stack1, *stack2;
struct context *ctx1, *ctx2;

/*
 * save() and restore() live in save_restore.s and have no header, so they are
 * declared here. Calling them undeclared is invalid since C99. save() can
 * return a second time (through restore), which GCC has to be told about.
 */
int save(struct context *ctx) __attribute__((returns_twice));
int restore(struct context *ctx);

/*
 * Cooperative context switch.
 *
 * prev: where the state of the currently running coroutine is stored.
 * next: the previously stored state to resume.
 *
 * The first return from save() yields 0 and control is handed to `next`.
 * When some other coroutine later restores `prev`, save() returns 1 and this
 * function simply returns to its caller.
 */
void schedule(struct context *prev, struct context *next)
{
	if (save(prev) == 0) {
		restore(next);
	}
}

/*
 * Body of coroutine 1: counts upwards starting at 2, printing one value per
 * second, and hands control to coroutine 2 whenever the value is a multiple
 * of 3.
 */
void func1()
{
	int counter;

	for (counter = 1; counter++ != 0; ) {
		printf("thread 1 :%d\n", counter);
		sleep(1);
		if (counter % 3 != 0) {
			continue;
		}
		schedule(ctx1, ctx2);
	}
}

/*
 * Body of coroutine 2: counts down from 0xfffe, printing one value per
 * second, and hands control to coroutine 1 whenever the value is a multiple
 * of 3.
 *
 * Unlike func1's loop, this one terminates when the counter reaches 0.
 * The coroutine runs on a hand-built stack that holds no valid return
 * address, so returning would jump to garbage. The process is ended
 * explicitly instead.
 */
void func2()
{
	int i = 0xffff;
	while (i--) {
		printf("thread 2 :%d\n", i);
		sleep(1);
		if (i%3 == 0) {
			schedule(ctx2, ctx1);
		}
	}
	exit(0);
}

/*
 * Entry point of the DIY coroutine demo.
 *
 * Allocates two contexts and two private stacks, points each context at its
 * own stack and entry function (func1 / func2), then switches to coroutine 1.
 * Neither argument is used.
 *
 * Returns 1 if an allocation fails; otherwise control never comes back here.
 */
int main(int unused1, char **unused2)
{
	/* Implemented in save_restore.s; there is no header that declares them. */
	extern int save(struct context *ctx) __attribute__((returns_twice));
	extern int restore(struct context *ctx);

	/* 4 KiB is marginal for printf(), so each coroutine gets 64 KiB. */
	enum { STACK_SIZE = 64 * 1024, STACK_HEADROOM = 96 };
	unsigned long top1, top2;

	(void)unused1;
	(void)unused2;

	ctx1 = calloc(1, sizeof *ctx1);
	ctx2 = calloc(1, sizeof *ctx2);
	stack1 = malloc(STACK_SIZE);
	stack2 = malloc(STACK_SIZE);
	if (ctx1 == NULL || ctx2 == NULL || stack1 == NULL || stack2 == NULL) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}

	/* Capture the current r12-r15; rsp, rbp and rip are replaced below. */
	save(ctx1);
	save(ctx2);

	/*
	 * The stack grows downwards, so start near the high end of each buffer.
	 * restore() as written resumes with rsp 8 bytes above ctx->rsp, so a
	 * 16-byte-aligned value here enters the function with rsp % 16 == 8,
	 * which is what the x86-64 System V ABI expects at function entry.
	 */
	top1 = ((unsigned long)stack1 + STACK_SIZE - STACK_HEADROOM) & ~0xFUL;
	top2 = ((unsigned long)stack2 + STACK_SIZE - STACK_HEADROOM) & ~0xFUL;

	/* These four assignments are the key step: they redirect each context. */
	ctx1->rip = (unsigned long)&func1;
	ctx1->rsp = ctx1->rbp = top1;
	ctx2->rip = (unsigned long)&func2;
	ctx2->rsp = ctx2->rbp = top2;

	/* Switch stacks and jump straight into func1 through the patched rip. */
	restore(ctx1);

	return 0;
}

编译之：

[root@localhost ~]# as --64 -o save_restore.o save_restore.s
[root@localhost ~]# gcc -c uthread.c -o uthread.o
[root@localhost ~]# gcc -o a.out save_restore.o uthread.o

看效果：

[root@localhost ~]# ./a.out
thread 1 :2
thread 1 :3
thread 2 :65534
thread 2 :65533
thread 2 :65532
thread 1 :4
thread 1 :5
thread 1 :6
thread 2 :65531
thread 2 :65530
thread 2 :65529
thread 1 :7
thread 1 :8
thread 1 :9
thread 2 :65528
thread 2 :65527
thread 2 :65526
thread 1 :10
thread 1 :11
thread 1 :12
thread 2 :65525
thread 2 :65524
thread 2 :65523
thread 1 :13
thread 1 :14
thread 1 :15
thread 2 :65522
^C

嗯，成功了！就是这个效果。

故事

死活是对setjmp/longjmp耿耿于怀，如果不能用它实现一个相对优雅的用户态多线程，那么便是让人觉得遗憾，然而最终，我还是觉得这太复杂了，这个问题便没有继续跟下去。

问题的复杂来源于我对jmp_buf的误解！

我一开始觉得 jmp_buf 这个贯穿整个setjmp/longjmp逻辑的数据结构是一个非常简单的容器类结构体，里面保存着以寄存器名称命名的字段，比如我只需如下操作便可以修改其rsp指针：

jmp_buf *env = ...
env->rsp = 0x12345678;

然而事情并不是这么简单！

glibc的代码谁要是承认能看懂并且承认喜欢看两眼，我送他一条名牌领带！glibc的代码太恶心了！但一般有文化的人还是喜欢用晦涩这个词…

我纳闷为什么很少有人分析jmp_buf这个结构体？或者说为什么很少有人分析glibc源代码本身。

我在前文中说过，setjmp/longjmp具有的局限性，即它只能 在共享堆栈的单执行流环境下从深往浅跳转！ 否则就会造成堆栈被破坏。

在单一的堆栈情况下，即单线程情况下，如果一个线程longjmp到一个已经返回的函数内部，那么可想而知当前的堆栈对于该函数已经完全没有意义，我们知道，堆栈是一个连续的内存空间，如果说一个函数已经返回了，那么该函数的栈帧将随之销毁，跳转导致其堆栈转到一个已经销毁的位置，那无异于指望楼阁悬停在半空。

单一的堆栈环境，这并不适合多线程，因为多个线程是并列平行的，每一个线程的堆栈必须是独立的。

setjmp/longjmp的局限并非仅仅在于 它没有提供设置堆栈的接口。 而是在于 setjmp/longjmp机制竟然没有公开通用的数据结构！ 以至于，我没有办法通过引用结构体字段的方式去修改该结构体某个字段的值！

没有接口我无所谓，我要能操作数据结构也行啊，问题是，这也不可能。

是的，我不知道 jmp_buf 它到底长什么样子。

但是，我理解它的原理，所以，我能自己写！

setjmp/longjmp的原理非常简单，它做的事情就两点：

保存当前的寄存器上下文到jmp_buf结构体。
通过函数调用堆栈顶部的返回地址更换来实现指令跳转。

在x86平台我们没有办法通过更改PC寄存器来实现指令跳转，我们要么用显式的jmp，要么用call指令返回地址的压栈，否则我们别无他法。

setjmp库函数，它做到了什么？它做到了保存当前寄存器上下文到一个叫做jmp_buf的数据结构，同时 setjmp它是一个函数调用， 它有能力获取它的 下一条指令 是什么，毕竟就是找栈顶嘛…同时，它有能力修改这个 下一条指令 以实现跳转，于是乎，longjmp函数实现上面我说的那个 修改下一条指令 以达到跳转的目标。

简直完美！

我知道这些之后，我主动写了一个程序，希望探究一下jmp_buf里到底都有什么。

也许你会说，直接看 setjmp.h 不就可以了么？是的，我也看了，我在我的系统里 /usr/include/setjmp.h 里看了，但是没有看懂啊！如果谁看懂了，教会我，我除了送名牌领带，还会送西裤，摩丝。

我是没有看懂，所以我选择自己折腾。

后来看了setjmp和longjmp的反汇编，我就知道了事情复杂的原因了。原来glibc把jmp_buf里面的内容给加密了！我靠！这太复杂了！

来自下面的地址，找到glibc的setjmp的代码：
https://code.woboq.org/userspace/glibc/sysdeps/x86_64/setjmp.S.html

/* glibc x86-64 __sigsetjmp, as quoted by the article.
   Stores rbx, rbp, r12-r15, the stack pointer as it will be after return,
   and the return address into the buffer at %rdi, using the JB_* slot
   indices.  When PTR_MANGLE is defined, rbp, the stack pointer and the
   return address are passed through PTR_MANGLE before being stored.  */
ENTRY (__sigsetjmp)
        /* Save registers.  */
        movq %rbx, (JB_RBX*8)(%rdi)
#ifdef PTR_MANGLE
# ifdef __ILP32__
        /* Save the high bits of %rbp first, since PTR_MANGLE will
           only handle the low bits but we cannot presume %rbp is
           being used as a pointer and truncate it.  Here we write all
           of %rbp, but the low bits will be overwritten below.  */
        movq %rbp, (JB_RBP*8)(%rdi)
# endif
        /* Mangle a copy of %rbp and store the mangled value.  */
        mov %RBP_LP, %RAX_LP
        PTR_MANGLE (%RAX_LP)
        mov %RAX_LP, (JB_RBP*8)(%rdi)
#else
        movq %rbp, (JB_RBP*8)(%rdi)
#endif
        /* r12-r15 are stored as they are, without mangling.  */
        movq %r12, (JB_R12*8)(%rdi)
        movq %r13, (JB_R13*8)(%rdi)
        movq %r14, (JB_R14*8)(%rdi)
        movq %r15, (JB_R15*8)(%rdi)
        lea 8(%rsp), %RDX_LP        /* Save SP as it will be after we return.  */
#ifdef PTR_MANGLE
        PTR_MANGLE (%RDX_LP)
#endif
        movq %rdx, (JB_RSP*8)(%rdi)
        mov (%rsp), %RAX_LP        /* Save PC we are returning to now.  */
        LIBC_PROBE (setjmp, 3, LP_SIZE@%RDI_LP, -4@%esi, LP_SIZE@%RAX_LP)
#ifdef PTR_MANGLE
        PTR_MANGLE (%RAX_LP)
#endif
        movq %rax, (JB_PC*8)(%rdi)
#ifdef SHADOW_STACK_POINTER_OFFSET
# if IS_IN (libc) && defined SHARED && defined FEATURE_1_OFFSET
        /* Check if Shadow Stack is enabled.  */
        testl $X86_FEATURE_1_SHSTK, %fs:FEATURE_1_OFFSET
        jz L(skip_ssp)
# else
        xorl %eax, %eax
# endif
        /* Get the current Shadow-Stack-Pointer and save it.  */
        rdsspq %rax
        movq %rax, SHADOW_STACK_POINTER_OFFSET(%rdi)
# if IS_IN (libc) && defined SHARED && defined FEATURE_1_OFFSET
L(skip_ssp):
# endif
#endif
#if IS_IN (rtld)
        /* In ld.so we never save the signal mask.  */
        xorl %eax, %eax
        retq
#else
        /* Make a tail call to __sigjmp_save; it takes the same args.  */
        jmp __sigjmp_save
#endif
END (__sigsetjmp)

日了鬼了！顺便你必须看一下对应的longjmp的代码：
https://code.woboq.org/userspace/glibc/sysdeps/x86_64/__longjmp.S.html
现在注意到PTR_MANGLE这个宏，应该就算完成任务了，别的先不管！

虽然我们很难跟踪到jmp_buf的明确定义，但是无论如何，我想看看jmp_buf的内容，那就写程序将它dump出来看呗：

#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>

jmp_buf ctx;
unsigned char *buf;

/*
 * Fill the global jmp_buf with setjmp() and print every byte of it as two
 * hex digits on one line, followed by an "----end----" marker.
 *
 * Returns 0.
 */
int main(void)
{
	size_t i;

	/*
	 * jmp_buf is an array type, so it is passed as-is; &ctx would have the
	 * wrong pointer type. The result is discarded with a (void) cast because
	 * the C standard does not allow assigning setjmp's value to a variable.
	 */
	(void)setjmp(ctx);

	buf = (unsigned char *)&ctx;

	for (i = 0; i < sizeof(jmp_buf); i++) {
		printf(" %.2x", buf[i]);
	}
	printf("\n----end----\n");

	return 0;
}

运行如下：

[root@localhost ~]# ./a.out
 00 00 00 00 00 00 00 00 21 89 10 58 21 c4 f8 c8 c0 04 40 00 00 00 00 00 d0 c0 19 de ff 7f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 21 89 30 58 21 c4 f8 c8 21 89 7c 2c 92 78 07 37 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
----end----

我直接盯上了第6个或者第7个八字节，别问我怎么盯上的，直觉！

然后将它改掉，就直接 段错误 了！

我为什么不能随便改jmp_buf里的值，即便我知道第6个或者第7个八字节肯定是返回地址，我也不能改，为什么？

因为这里有个 指针加密 的特性！即glibc的 PTR_MANGLE ！dump出来的jmp_buf中的值其实是一个加过密的值，而不是原始值。

到底 PTR_MANGLE 是个什么？我也不知道，网上搜搜它的相关，它是 怕类似被压栈的函数返回地址被篡改从而导致指令流被劫持而引入的一种数据加密技术。 这么做的必要性在于，虽然处理器可以阻止PC寄存器被写，但是没有办法阻止内存中的栈被写，而像C语言函数调用约定中，函数的返回地址就是被压入到栈中的，只要改变了它，就能改变执行流！

关于 PTR_MANGLE 下面的文章是有益的：
https://udrepper.livejournal.com/13393.html
该文的最后一段：

Using encryption (instead of canaries) to protect structures like jmp_buf is at least as secure and in addition faster. Question is whether we can extend the use to other parts of the runtime. Runtimes for languages like C++ and Java just scream for such a protection, virtual function tables are a prime target.

说的就是setjmp/longjmp中使用PTR_MANGLE的情况，事实上，glibc就是这般保护jmp_buf中的PC寄存器字段的，该字段的值在longjmp中将用来替换longjmp被调用时压栈的返回地址。

现在让我们来看看glibc中关于 指针加密的宏 PTR_MANGLE 的伟大定义，来自：
https://code.woboq.org/userspace/glibc/sysdeps/unix/sysv/linux/x86_64/sysdep.h.html#405

/* glibc's assembly-level pointer mangling, as quoted by the article:
   XOR the register with the guard value at %fs:POINTER_GUARD, then rotate
   the result left by 2*LP_SIZE+1 bits.  */
#  define PTR_MANGLE(reg)        xor %fs:POINTER_GUARD, reg;          \
                                rol $2*LP_SIZE+1, reg

因此上述程序dump出来的jmp_buf中，其中的 21 89 30 58 21 c4 f8 c8 以及 21 89 7c 2c 92 78 07 37 其实是加密过的指针值，而并不是原始的、指向函数返回后下一条指令的指针。

如果想通过修改PTR_MANGLE过的数据来达到替换返回地址的目的，你就必须将要替换的地址用 PTR_MANGLE 来搞一下才行。

好吧，那本文的最后，按照前面在glibc中撸出来的PTR_MANGLE的定义，整一个呗。

代码(原生setjmp/longjmp版)

我先将这个setjmp中的PTR_MANGLE和对应longjmp中对应的PTR_DEMANGLE抽出来，写成两个函数：

// 别问我代码里那些0x11，0x30魔术字怎么来的，撸代码和gdb撸出来的！
/*
 * Mangle a pointer-sized value the way glibc's PTR_MANGLE does on x86-64:
 * XOR it with the 64-bit guard read from %fs:0x30, then rotate the result
 * left by 0x11 (17) bits.
 *
 * var: the plain value (for example a code or stack address).
 * Returns the mangled value, suitable for storing into a jmp_buf slot.
 *
 * The operation is done in place on the operand register. The earlier version
 * staged the value in %rdx without listing it as clobbered, which can silently
 * corrupt whatever the compiler keeps in %rdx. __asm__ is used instead of asm
 * so the code also builds under -std=c99 / -std=c11.
 */
unsigned long PTR_MANGLE(unsigned long var)
{
    __asm__ ("xor %%fs:0x30, %0\n\t"
             "rol $0x11, %0"
             : "+r" (var)
             :
             : "cc");
    return var;
}

/*
 * Inverse of PTR_MANGLE: rotate the value right by 0x11 (17) bits, then XOR
 * it with the 64-bit guard read from %fs:0x30.
 *
 * var: a mangled value, for example one read from a jmp_buf slot.
 * Returns the plain value.
 *
 * __asm__ is used instead of asm so the code also builds under
 * -std=c99 / -std=c11, and the flags register is listed as clobbered.
 */
unsigned long PTR_DEMANGLE(unsigned long var)
{
    __asm__ ("ror $0x11, %0\n\t"
             "xor %%fs:0x30, %0"
             : "+r" (var)
             :
             : "cc");
    return var;
}

接下来的事情很简单，我们看glibc中setjmp的定义，知道jmp_buf中下标为7（从0开始计数，也就是第8个）的八字节保存的是函数的返回地址：

#define JB_PC        7

那么就先在jmp_buf里定位下标为7（从0开始计数）的那个八字节，然后将其替换成经过PTR_MANGLE加密的另一个函数的地址，就可以实现跳转了：

#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>

#define JB_PC        7

jmp_buf ctx;

/*
 * Target of the hijacked jump: writes one line to stdout and terminates the
 * process with exit status 123, so it never returns to a caller.
 */
void func()
{
	fputs("浙江温州皮鞋?湿\n", stdout);
	exit(123);
}

/*
 * Demonstrates redirecting longjmp(): after setjmp() fills the global
 * jmp_buf, the return-address slot (index JB_PC) is overwritten with the
 * mangled address of func(), so the following longjmp() lands in func()
 * instead of returning from setjmp().
 *
 * Returns 0, although control is not expected to get that far.
 */
int main(void)
{
	/* Defined in the previous snippet of the article. */
	extern unsigned long PTR_MANGLE(unsigned long var);
	unsigned long *prip;

	/*
	 * jmp_buf is an array type and is passed as-is; &ctx would have the
	 * wrong pointer type. The result is discarded with a (void) cast because
	 * the C standard does not allow assigning setjmp's value to a variable.
	 */
	(void)setjmp(ctx);

	prip = ((unsigned long *)&ctx) + JB_PC;
	*prip = PTR_MANGLE((unsigned long)&func);

	longjmp(ctx, 2);
	return 0;
}

想想会怎样？在longjmp中会跳转到我们的func函数里面打印 “浙江温州皮鞋?湿” 吗？试试看：

[root@localhost ~]# ./a.out
浙江温州皮鞋?湿
[root@localhost ~]# echo $?
123

是的，下雨进水不会胖！

接下来，我们可以按照这种方式修改jmp_buf的任意 被加密的寄存器地址了 。我给出本文最初我自己DIY的save/restore例子的标准setjmp/longjmp版本：

// 基于标准的setjmp/longjmp实现！
// 然而我不知道如何才能直接用 PTR_MANGLE 这个宏，所以我使用自己实现内联汇编版本！
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>

unsigned char *stack1, *stack2;
jmp_buf ctx1, ctx2;

/*
 * Mangle a pointer-sized value the way glibc's PTR_MANGLE does on x86-64:
 * XOR it with the 64-bit guard read from %fs:0x30, then rotate the result
 * left by 0x11 (17) bits.
 *
 * var: the plain value (for example a code or stack address).
 * Returns the mangled value, suitable for storing into a jmp_buf slot.
 *
 * The operation is done in place on the operand register. The earlier version
 * staged the value in %rdx without listing it as clobbered, which can silently
 * corrupt whatever the compiler keeps in %rdx. __asm__ is used instead of asm
 * so the code also builds under -std=c99 / -std=c11.
 */
unsigned long PTR_MANGLE(unsigned long var)
{
	__asm__ ("xor %%fs:0x30, %0\n\t"
		 "rol $0x11, %0"
		 : "+r" (var)
		 :
		 : "cc");
	return var;
}

/*
 * Inverse of PTR_MANGLE: rotate the value right by 0x11 (17) bits, then XOR
 * it with the 64-bit guard read from %fs:0x30.
 *
 * var: a mangled value, for example one read from a jmp_buf slot.
 * Returns the plain value.
 *
 * __asm__ is used instead of asm so the code also builds under
 * -std=c99 / -std=c11, and the flags register is listed as clobbered.
 */
unsigned long PTR_DEMANGLE(unsigned long var)
{
	__asm__ ("ror $0x11, %0\n\t"
		 "xor %%fs:0x30, %0"
		 : "+r" (var)
		 :
		 : "cc");
	return var;
}

/*
 * Cooperative context switch built on setjmp/longjmp.
 *
 * prev: buffer that receives the state of the currently running coroutine.
 * next: buffer holding the state to resume.
 *
 * When setjmp() returns 0 the current state has just been stored and control
 * is handed to `next`. When another coroutine later longjmps to `prev`,
 * setjmp() returns non-zero and this function returns to its caller.
 *
 * setjmp() is used directly as the operand of a comparison. Assigning its
 * result to a variable is not among the contexts the C standard permits.
 */
void schedule(jmp_buf *prev, jmp_buf *next)
{
	if (setjmp(*prev) == 0) {
		longjmp(*next, 2);
	}
}

/*
 * Body of coroutine 1: counts upwards starting at 2, printing one value per
 * second, and hands control to coroutine 2 whenever the value is a multiple
 * of 3.
 */
void func1()
{
	int counter;

	for (counter = 1; counter++ != 0; ) {
		printf("thread 1 :%d\n", counter);
		sleep(1);
		if (counter % 3 != 0) {
			continue;
		}
		schedule(&ctx1, &ctx2);
	}
}

/*
 * Body of coroutine 2: counts down from 0xfffe, printing one value per
 * second, and hands control to coroutine 1 whenever the value is a multiple
 * of 3.
 *
 * Unlike func1's loop, this one terminates when the counter reaches 0.
 * The coroutine runs on a hand-built stack that holds no valid return
 * address, so returning would jump to garbage. The process is ended
 * explicitly instead.
 */
void func2()
{
	int i = 0xffff;
	while (i--) {
		printf("thread 2 :%d\n", i);
		sleep(1);
		if (i%3 == 0) {
			schedule(&ctx2, &ctx1);
		}
	}
	exit(0);
}

/*
 * Indices of the 8-byte slots in the jmp_buf, taken from the JB_* names used
 * by glibc's x86-64 setjmp. Only the three slots that main() rewrites are
 * defined; the remaining ones are listed for reference:
 *
 *   JB_RBX 0, JB_R12 2, JB_R13 3, JB_R14 4, JB_R15 5
 */
#define JB_RBP        1
#define JB_RSP        6
#define JB_PC        7

/*
 * Entry point of the setjmp/longjmp coroutine demo.
 *
 * Fills two jmp_bufs with setjmp(), then rewrites their stack-pointer,
 * frame-pointer and return-address slots (mangled with PTR_MANGLE) so that
 * each buffer describes a coroutine starting at func1 / func2 on its own
 * heap-allocated stack. Finally it jumps into coroutine 1.
 *
 * Returns 1 if a stack cannot be allocated; otherwise control never comes
 * back here.
 */
int main(void)
{
	/* 4 KiB is marginal for printf(), so each coroutine gets 64 KiB. */
	enum { STACK_SIZE = 64 * 1024 };
	unsigned long *slots1 = (unsigned long *)&ctx1;
	unsigned long *slots2 = (unsigned long *)&ctx2;
	unsigned long sp1, sp2;

	stack1 = malloc(STACK_SIZE);
	stack2 = malloc(STACK_SIZE);
	if (stack1 == NULL || stack2 == NULL) {
		fprintf(stderr, "out of memory\n");
		return 1;
	}

	memset(&ctx1, 0, sizeof(jmp_buf));
	memset(&ctx2, 0, sizeof(jmp_buf));

	/*
	 * Only the side effect of filling the buffers is wanted. The results
	 * are discarded with (void) casts because the C standard does not allow
	 * assigning setjmp's value to a variable.
	 */
	(void)setjmp(ctx1);
	(void)setjmp(ctx2);

	/*
	 * The stack grows downwards, so start at the high end of each buffer.
	 * longjmp enters the target with rsp equal to the stored value, and the
	 * x86-64 System V ABI expects rsp % 16 == 8 at function entry, hence
	 * "align down to 16, then subtract 8".
	 */
	sp1 = (((unsigned long)stack1 + STACK_SIZE) & ~0xFUL) - 8;
	sp2 = (((unsigned long)stack2 + STACK_SIZE) & ~0xFUL) - 8;

	/* Store the mangled values that longjmp will demangle and load. */
	slots1[JB_PC] = PTR_MANGLE((unsigned long)&func1);
	slots1[JB_RSP] = slots1[JB_RBP] = PTR_MANGLE(sp1);
	slots2[JB_PC] = PTR_MANGLE((unsigned long)&func2);
	slots2[JB_RSP] = slots2[JB_RBP] = PTR_MANGLE(sp2);

	longjmp(ctx1, 2);

	return 0;
}

看看效果：

[root@localhost ~]# ./a.out
thread 1 :2
thread 1 :3
thread 2 :65534
thread 2 :65533
thread 2 :65532
thread 1 :4
thread 1 :5
thread 1 :6
thread 2 :65531
thread 2 :65530
thread 2 :65529
thread 1 :7
thread 1 :8
thread 1 :9
thread 2 :65528
thread 2 :65527
thread 2 :65526
^C

和本文开头前后呼应，不再多说。

首先自己写了一个简易的setjmp/longjmp的模仿版，实现了协程，然后最终搞定了原生的setjmp/longjmp。很low，但是不错！

浙江温州皮鞋湿，下雨进水不会胖！