带你学习《深入理解计算机系统》链接（1）——链接过程与变量重定位

最新推荐文章于 2024-06-25 01:15:39 发布

coreyspomu

最新推荐文章于 2024-06-25 01:15:39 发布

阅读量1.6k

点赞数

分类专栏：链接原理文章标签：链接汇编

本文链接：https://blog.csdn.net/u013471946/article/details/44224309

版权

链接原理专栏收录该内容

1 篇文章 0 订阅

订阅专栏

本文探讨了链接过程和变量重定位的概念，通过《深入理解计算机系统》中的例子，详细解释了从预处理到汇编再到链接的步骤。在链接阶段，分析了为何使用ld时会出现错误，以及如何通过gcc解决。同时，文章还分析了汇编代码中的变量表示，特别是全局和静态变量的处理，以及重定位文件中的符号规则，包括强符号和弱符号的处理策略。

摘要由CSDN通过智能技术生成

据教材的意思，历年历代的计算机系统文献都没有很好的讲述链接。因为：

1、链接处在编译器、计算机体系结构和操作系统交叉点上，它要求理解代码生成、机器语言编程、程序实例化和虚拟存储器；

2、链接恰好不落在某个通常的计算机系统专业中，因此这些专业领域的经典文献无一例外没有很好的描述它；

3、Levine的《Linkers and Loaders》是本关于链接的不错的书籍，不过我暂没有看的打算o(╯□╰)o

一、链接的各个过程以及相关文件

先完整的给出我们的测试代码，都很简单

独立swap.c

独立addvec_o.c

/* swap.c */

/* Declared here only; the array itself is defined in main2.c (int buf[2]). */
extern int buf[];

/* Initialized global pointer: starts out holding the address of buf[0]. */
int *bufp0 = &buf[0];
/* Global pointer with no initializer; swap() assigns it on every call. */
int *bufp1;
/* Global int with no initializer; nothing in this file reads or writes it. */
int a;

/*
 * swap - exchange the two ints addressed by bufp0 and bufp1.
 *
 * bufp1 is first pointed at buf[1]; bufp0 is used with whatever value it
 * currently holds (its initializer makes that &buf[0]).
 */
void swap()
{
int temp;
static int cao; /* function-local static; never used in this function */
bufp1 = &buf[1];
temp = *bufp0;
*bufp0 = *bufp1;
*bufp1 = temp;
}

/* addvec_o.c */

/*
 * addvec_o - element-wise vector addition.
 *
 * Stores x[k] + y[k] into z[k] for every k in [0, n), in ascending order.
 * Nothing is written when n <= 0.
 */
void addvec_o(int *x, int *y,
              int *z, int n)
{
    int *lhs = x;
    int *rhs = y;
    int *out = z;
    int *stop;

    if (n <= 0)
        return;

    /* Walk the three arrays in lock-step until n results are stored. */
    stop = z + n;
    while (out != stop) {
        *out = *lhs + *rhs;
        ++lhs;
        ++rhs;
        ++out;
    }
}

用来生成静态库libvector.a或者动态库libvector.so的文件：

链接库函数multvec.c

链接库函数addvec.c

/* multvec.c */

/*
 * multvec - element-wise vector multiplication.
 *
 * Stores x[k] * y[k] into z[k] for every k in [0, n), in ascending order.
 * Nothing is written when n <= 0.
 */
void multvec(int *x, int *y,
             int *z, int n)
{
    int k = 0;

    while (k < n) {
        int product = x[k] * y[k];

        z[k] = product;
        k += 1;
    }
}

/* addvec.c */

/*
 * addvec - element-wise vector addition, the library (libvector) counterpart
 * of the standalone addvec_o().
 *
 * The function must be named addvec, not addvec_o: main2.c calls addvec(),
 * and a second addvec_o definition would clash with the one in addvec_o.c.
 *
 * x, y: input arrays with at least n elements each.
 * z:    output array with at least n elements; z[i] receives x[i] + y[i].
 * n:    number of elements to process; nothing is written when n <= 0.
 */
void addvec(int *x, int *y,
            int *z, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i];
}

//main2.c:

#include <stdio.h>
#include "vector.h"

/* Functions defined in the separately compiled swap.c and addvec_o.c. */
extern void swap();
extern void addvec_o(int *x, int *y, int *z, int n);

/* Definition of the array that swap.c declares as `extern int buf[]`. */
int buf[2] = {1, 2};

/* Initialized function pointer holding the address of swap; it is not called in main2.c. */
void (*fp)(void) = swap;

/* Two initialized input vectors and one result vector without an initializer. */
int x[2] = {1, 2};
int y[2] = {3, 4};
int z[2];

/* Initialized pointer: starts out pointing at z; main() later re-points it at y. */
int *p = z;

/*
 * main - driver used to study how different kinds of variables and calls
 * appear in the generated assembly and in the relocated executable.
 *
 * It writes two function-local statics, re-points the global p, calls
 * swap(), addvec_o() and addvec(), and prints z together with c and d.
 * Always returns 0.
 */
int main()
{
    static int a;        /* function-local static, no initializer */
    static int b = 5;    /* function-local static, initialized */
    const int c;         /* NOTE(review): never initialized, yet read by printf below -- its value is indeterminate */
    const int d = 6;     /* initialized const local */
    a = 8;
    b = 9;
    p = y;               /* global p now points at y instead of z */
    swap();
    addvec_o(x, y, z, 2);
    /* NOTE(review): no prototype for addvec is visible here; presumably "vector.h" declares it -- confirm. */
    addvec(y, x, z, 2);
    printf("z = [%d %d], c = %d, d = %d\n", z[0], z[1], c, d);
    return 0;
}

一堆源码要生成可执行文件，中间至少要经历四个步骤

1、通过预处理器cpp将.c文件翻译成ASCII码中间文件.i，比如cpp main2.c main2.i，我进去看了看，代码部分似乎没变，但是注释全被消除了；

2、通过编译器cc，将main2.i翻译成ASCII码汇编文件.s，比如cc -O2 -S main2.i -o main2.s，生成了标准的汇编代码；

3、通过汇编器as将main2.s翻译成可重定位目标文件.o，比如as main2.s -o main2.o，生成我们常见的目标文件；

4、最后通过链接器ld创建最终可执行文件p：ld -o p main2.o swap.o addvec_o.o ./libvector.so，不过运行报：ld: warning: cannot find entry symbol _start; defaulting to 08048298，

运行./p，报错：/usr/lib/libc.so.1: bad ELF interpreter。

为什么第四步执行会报警，并且生成的p无法执行会报错呢？我用gcc代替ld进行连接，能正确生成p并执行打印结果z = [4 6]，说明前三步没有问题，问题就出在链接ld，我用gcc -v 代替ld再次运行发现，得到以下信息：

gcc version 3.4.6 20060404 (Red Hat 3.4.6-3)
/usr/libexec/gcc/i386-redhat-linux/3.4.6/collect2 --eh-frame-hdr -m elf_i386 -dynamic-linker /lib/ld-linux.so.2 -o p /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crt1.o /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crti.o /usr/lib/gcc/i386-redhat-linux/3.4.6/crtbegin.o -L/usr/lib/gcc/i386-redhat-linux/3.4.6 -L/usr/lib/gcc/i386-redhat-linux/3.4.6 -L/usr/lib/gcc/i386-redhat-linux/3.4.6/../../.. main2.o swap.o addvec_o.o ./libvector.so -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/i386-redhat-linux/3.4.6/crtend.o /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crtn.o

我们可以看到，gcc实际上是通过collect2（ld的包装程序）来执行链接的，其中-dynamic-linker /lib/ld-linux.so.2指定的是程序运行时使用的动态链接器（ELF解释器），而在main2.o swap.o addvec_o.o ./libvector.so 的前后都有很多.o的文件被链接，那么我们移花接木一下：

ld -o p /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crt1.o /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crti.o /usr/lib/gcc/i386-redhat-linux/3.4.6/crtbegin.o -L/usr/lib/gcc/i386-redhat-linux/3.4.6 -L/usr/lib/gcc/i386-redhat-linux/3.4.6 -L/usr/lib/gcc/i386-redhat-linux/3.4.6/../../.. main2.o swap.o addvec_o.o ./libvector.so -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/i386-redhat-linux/3.4.6/crtend.o /usr/lib/gcc/i386-redhat-linux/3.4.6/../../../crtn.o

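再次运行，没有报错，生成p文件，运行p报错：/usr/lib/libc.so.1: bad ELF interpreter，好吧，我就把/usr/lib/libc.so在同一个地方复制一个备份文件/usr/lib/libc.so.1，并且把权限设置成完全可读，却报错：./p: 正在访问一个已毁坏的共享库……好吧，我暂且投降，看来要在redhat里手动链接生成一个可执行文件，涉及到的链接信息相对复杂，已超出我的认知范畴，暂且无奈的用gcc代替ld帮我做链接，反正我只讨论原理性问题！相信熟悉gcc的朋友能够找到解决办法。（注：原因在于上面手动执行的ld命令漏掉了gcc传给collect2的 -dynamic-linker /lib/ld-linux.so.2 选项，ld于是把默认的解释器路径/usr/lib/libc.so.1写进了可执行文件；补上该选项即可生成能正常运行的p。）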

事实上，shell调用加载器loader函数，由它拷贝可执行文件p中的代码和数据到存储器，然后将控制转移到这个程序的开头，链接工作就是为加载器做准备的。

二、变量重定位分析

那么我们先来看下main2.s文件：

首先是它的数据区：

main2.s汇编区

        .file   "main2.c"
.globl p //int *p = z;
        .data
        .align 4
        .type   p, @object
        .size   p, 4
p:
        .long   z
.globl y //int y[2] = {3, 4};
        .align 4
        .type   y, @object
        .size   y, 8
y:
        .long   3
        .long   4
.globl x //int x[2] = {1, 2};
        .align 4
        .type   x, @object
        .size   x, 8
x:
        .long   1
        .long   2
.globl fp //void (*fp)(void) = swap;
        .align 4
        .type   fp, @object
        .size   fp, 4
fp:
        .long   swap
.globl buf //int buf[2] = {1, 2};
        .align 4
        .type   buf, @object
        .size   buf, 8
buf:
        .long   1
        .long   2
        .local a.0 //static int a;
        .comm   a.0,4,4
        .align 4
        .type   b.1, @object
        .size   b.1, 4
b.1: //static int b = 5;
        .long   5
        .section        .rodata.str1.1,"aMS",@progbits,1
.LC0:
        .string "z = [%d %d]\n" //printf("z = [%d %d],……
        .text
        .p2align 2,,3
.globl main
        .type   main, @function

能明显看出，但凡是被初始化的全局变量或静态变量，在.s中都有专门的描述，而对于没有被初始化的a和z，都以附加的形式存在。而main中的局部变量则完全没有提及。

       main:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ebx
        pushl   %eax
        andl    $-16, %esp //可执行文件栈初始化统一位置
        subl    $16, %esp //main自带两个参数加两个const局部变量，总共4个sizeof(int)空间
        movl    $8, a.0
        movl    $9, b.1
        movl    $y, p //p = y
        call    swap
        pushl   $2 //addvec_o参数压栈
        pushl   $z
        pushl   $y
        pushl   $x
        call    addvec_o
        pushl   $2 //addvec参数压栈
        pushl   $z
        pushl   $y
        pushl   $x
        call    addvec
        addl    $20, %esp //printf函数空间准备
        pushl   $6 //printf可变参数压栈
        pushl   %ebx
        pushl   z+4 //数组访问
        pushl   z //数组访问
        pushl   $.LC0 //printf""内字符串
        call    printf
        xorl    %eax, %eax //return 0
        movl    -4(%ebp), %ebx
        leave
        ret

在main函数中，先前的全局和静态变量在汇编文件中几乎都是以明文的形式进行操作，特别关注$y, p，居然把y当成立即数赋值给指针p……太直白了吧也……接下来的pushl，但凡涉及到数组首地址的都是立即数修饰明文……然后函数跳转call接的也是明文……再来看调用printf函数之前压入的参数，pushl $6？咋那么眼熟呢？原来是在传int const d！人家直接把变量都给省了，拎出6来直接算……

我们再来看下汇编文件swap.s：

        .file   "swap.c"
.globl bufp0
        .data
        .align 4
        .type   bufp0, @object
        .size   bufp0, 4
bufp0:
        .long   buf
        .text
.globl swap
        .type   swap, @function
swap:
        pushl   %ebp
        movl    %esp, %ebp
        movl    $buf+4, bufp1
        movl    bufp0, %edx
        movl    (%edx), %ecx
        movl    buf+4, %eax
        movl    %eax, (%edx)
        movl    bufp1, %eax
        movl    %ecx, (%eax)
        leave
        ret

我们看到，只有bufp0这样被初始化过的全局变量有所描述，外部变量buf在bufp0的赋值中出现，而bufp1在swap函数中也是明文显示，却没有任何定义。

接下来我们来看看可重定位文件main2.o的反汇编版本：

00000000 <main>:
   0:   55                                     push   %ebp
   1:   89 e5 mov    %esp,%ebp
   3:   53                                     push   %ebx
   4:   50                                   push   %eax
   5:   83 e4 f0 and    $0xfffffff0,%esp
   8:   83 ec 10 sub    $0x10,%esp
   b:   c7 05 00 00 00 00 08 movl   $0x8,0x0
12:   00 00 00
15:   c7 05 20 00 00 00 09 movl   $0x9,0x20
1c:   00 00 00
1f:   c7 05 00 00 00 00 00 movl   $0x0,0x0
26:   00 00 00
29:   e8 fc ff ff ff call   2a <main+0x2a>
2e:   6a 02 push   $0x2
30:   68 00 00 00 00 push   $0x0
35:   68 00 00 00 00 push   $0x0
3a:   68 00 00 00 00 push   $0x0
3f:   e8 fc ff ff ff                       call   40 <main+0x40>
44:   6a 02 push   $0x2
46:   68 00 00 00 00 push   $0x0
4b:   68 00 00 00 00 push   $0x0
50:   68 00 00 00 00 push   $0x0
55:   e8 fc ff ff ff call   56 <main+0x56>
5a:   83 c4 14 add    $0x14,%esp
5d:   6a 06 push   $0x6
5f:   53 push   %ebx
60:   ff 35 04 00 00 00 pushl 0x4
66:   ff 35 00 00 00 00 pushl 0x0
6c:   68 00 00 00 00 push   $0x0
71:   e8 fc ff ff ff call   72 <main+0x72>
76:   31 c0 xor    %eax,%eax
78:   8b 5d fc mov    0xfffffffc(%ebp),%ebx
7b:   c9 leave
7c:   c3 ret

反汇编 .data 节：

00000000 <p>:
0: 00 00 add %al,(%eax)
...

00000004 <y>:
   4:   03 00                   add    (%eax),%eax
   6:   00 00                   add    %al,(%eax)
   8:   04 00                   add    $0x0,%al
        ...

0000000c <x>:
   c:   01 00                   add    %eax,(%eax)
   e:   00 00                   add    %al,(%eax)
10:   02 00                  add    (%eax),%al
        ...

00000014 <fp>:
14: 00 00 add %al,(%eax)
...

00000018 <buf>:
18:   01 00                   add    %eax,(%eax)
1a:   00 00                   add    %al,(%eax)
1c:   02 00                   add    (%eax),%al
        ...

00000020 <b.1>:
20:   05 .byte 0x5
21:   00 00                   add    %al,(%eax)
        ...
反汇编 .bss 节：

00000000 <a.0>:
0: 00 00 add %al,(%eax)
...

我们这暂时只涉及.text、.data以及.bss节，先看.data段，我们发现，标号00~20，分别对应六个已初始化的全局变量和静态变量，而未被初始化的静态变量a处在.bss节，标号仍然是0。

最令人奇怪的是，标号貌似只是说明该变量在文件中所处的位置，而实际上在.o中，他们都以$0x0来指代，可以看出在调用addvec两个类型函数时，x\y\z三个参数都是$0x0压栈进的函数！链接器是如何识别不同位置调用的函数区别的呢？当初写文章时没完全搞清楚，只在.o中追踪到了区分依据的位置，可以参考我曾经发的求助贴：http://bbs.csdn.net/topics/391003896 ，最终被一位自称菜鸟但观察力惊人的网友解答了，有兴趣的盆友可以去看看:)（简单地说，区分的依据是.o文件中的重定位表：.rel.text节为每一个占位地址记录了它的偏移、重定位类型以及所引用的符号，可以用readelf -r main2.o或objdump -r main2.o查看。）

好了，接下来看.data节的其他部分。我们看到y、x的初始值就是C代码中的3、4和1、2，只不过在.o文件中，他们都占据了4字节，因此中间才会间隔两字节的00 00。buf的赋值类似，静态变量b的赋值也是四字节，但是b.1是什么意思呢？其实猜都能猜到，这是为了区分可能出现的多重定义。事实上，C语言工程中经常容易出现变量重名，而编译器对于重名现象并不是绝对排斥，而是有选择性的指定某个变量为有效变量。

在编译时，编译器向汇编器输出变量符号，有“强”和“弱”的区别：函数和已初始化的全局变量是强符号，未初始化的全局变量是弱符号，这里的全局变量包含了局部静态变量。Unix链接器用以下规则处理多重定义符号：

规则1：不允许多个强符号。

规则2：强弱符号并存的情况下，选择强符号。

规则3：多个弱符号并存的情况下，任选一个。

有了这三个规则，看起来貌似解决得完美，但实际上，由于后两个规则的存在，当出现已知或未知的多重定义，你并不能确定在哪段代码哪个变量是实际生效的，可能会引发一些事故，类似例子可以在网上搜到，这里就不提了。这里例子中，b是已初始化全局变量，因此此时他就是强符号，如果再出现b.2或者b.3，估计就是出现了弱符号。（注：严格来说，main中的b是局部静态变量，b.1是编译器为它生成的本地符号名，数字后缀用来区分不同函数中同名的静态变量；本地符号对其他模块不可见，并不参与上述强弱符号的解析。）

好了，最后一步由链接器将.o文件生成可执行文件，gcc -o p main2.o swap.o addvec_o.o ./libvector.so

0804848c <main>:
804848c:       55                                         push   %ebp
804848d:       89 e5                                   mov    %esp,%ebp
804848f:       53                                         push   %ebx
8048490:       50                                        push   %eax
8048491:       83 e4 f0 and    $0xfffffff0,%esp
8048494:       83 ec 10 sub    $0x10,%esp
8048497:       c7 05 d4 97 04 08 08 movl   $0x8,0x80497d4
804849e:       00 00 00
80484a1:       c7 05 c8 97 04 08 09 movl   $0x9,0x80497c8
80484a8:       00 00 00
80484ab:       c7 05 a8 97 04 08 ac movl   $0x80497ac,0x80497a8
80484b2:       97 04 08
80484b5:       e8 5a 00 00 00 call   8048514 <swap>
80484ba:       6a 02 push   $0x2
80484bc:       68 d8 97 04 08 push   $0x80497d8
80484c1:       68 ac 97 04 08 push   $0x80497ac
80484c6:       68 b4 97 04 08 push   $0x80497b4
80484cb:       e8 70 00 00 00 call   8048540 <addvec_o>
80484d0:       6a 02 push   $0x2
80484d2:       68 d8 97 04 08 push   $0x80497d8
80484d7:       68 b4 97 04 08 push   $0x80497b4
80484dc:       68 ac 97 04 08 push   $0x80497ac
80484e1:       e8 ce fe ff ff call   80483b4 <addvec@plt>
80484e6:       83 c4 14 add    $0x14,%esp
80484e9:       6a 06 push   $0x6
80484eb:       53 push   %ebx
80484ec:       ff 35 dc 97 04 08 pushl 0x80497dc
80484f2:       ff 35 d8 97 04 08 pushl 0x80497d8
80484f8:       68 78 86 04 08 push   $0x8048678
80484fd:       e8 d2 fe ff ff call   80483d4 <printf@plt>
8048502:       31 c0 xor    %eax,%eax
8048504:       8b 5d fc mov    0xfffffffc(%ebp),%ebx
8048507:       c9                      leave
8048508:       c3                      ret
8048509:       90                      nop
804850a:       90                      nop
804850b:       90                      nop

……

08048514 <swap>:
8048514:       55 push   %ebp
8048515:       89 e5 mov    %esp,%ebp
8048517:       e8 f0 ff ff ff call   804850c <shit>
804851c:       a1 cc 97 04 08 mov    0x80497cc,%eax
8048521:       8b 15 c4 97 04 08 mov    0x80497c4,%edx
8048527:       8b 08   mov    (%eax),%ecx
8048529:       89 10 mov    %edx,(%eax)
804852b:       c7 05 e0 97 04 08 c4    movl   $0x80497c4,0x80497e0
8048532:       97 04 08
8048535:       89 0d c4 97 04 08 mov    %ecx,0x80497c4
804853b:       c9                      leave
804853c:       c3                      ret
804853d:       90                      nop
804853e:       90                      nop
804853f:       90                      nop

08048540 <addvec_o>:
8048540:       55 push   %ebp
8048541:       89 e5                   mov    %esp,%ebp
8048543:       57 push   %edi
8048544:       56 push   %esi
8048545:       53 push   %ebx

……

080497a4 <p.0>:
80497a4: a8 96 test $0x96,%al
80497a6: 04 08 add $0x8,%al

080497a8 <p>:
80497a8: d8 97 04 08 03 00 fcoms 0x30804(%edi)

080497ac <y>:
80497ac:       03 00                   add    (%eax),%eax
80497ae:       00 00                   add    %al,(%eax)
80497b0:       05 00 00 00 01    add    $0x1000000,%eax

080497b4 <x>:
80497b4:       01 00                   add    %eax,(%eax)
80497b6:       00 00                   add    %al,(%eax)
80497b8:       02 00                   add    (%eax),%al
        ...

080497bc <fp>:
80497bc: 14 85 adc $0x85,%al
80497be: 04 08 add $0x8,%al

080497c0 <buf>:
80497c0:       01 00                   add    %eax,(%eax)
80497c2:       00 00                   add    %al,(%eax)
80497c4:       02 00                   add    (%eax),%al
        ...

080497c8 <b.1>:
80497c8: 05 00 00 00 c0 add $0xc0000000,%eax

080497cc <bufp0>:
80497cc:       c0 .byte 0xc0
80497cd:       97 xchg   %eax,%edi
80497ce:       04 08                   add    $0x8,%al
反汇编 .bss 节：

080497d0 <completed.1>:
80497d0: 00 00 add %al,(%eax)
...

080497d4 <a.0>:
80497d4: 00 00 add %al,(%eax)

...

080497d8 <z>:
...

080497e0 <bufp1>:
80497e0: 00 00 add %al,(%eax)

printf调用先不看，可以明显发现，函数调用，全局变量的赋值，都已经给出了绝对的虚拟地址值，没啥好说的。而关于静态库和动态库的生成，也因为太死板不再涉及，那么在下一节，将涉及动态库调用的重定位。