从 Fishhook 到 Mach-O 文件格式(一)

开始文章之前, 按照惯例先把Fishhook文档中的图片放在最前面。


其中的四个表格值得我们特别注意:

  1. Lazy Symbol Pointer Table (延迟符号指针表)
  2. Indirect Symbol Table (间接符号表)
  3. Symbol Table (符号表)
  4. String Table (字符串表)

/*
 * One node of a singly linked list of rebinding batches. Each node carries
 * an array of `rebindings_nel` rebinding records and a link to the next node.
 */
struct rebindings_entry {  
    /* array of rebinding records held by this node */
    struct rebinding *rebindings;  
    /* number of elements in `rebindings` */
    size_t rebindings_nel;  
    /* following node in the list (the node that was the head before this one was prepended) */
    struct rebindings_entry *next;};
/* File-level list head; _rebind_symbols_for_image passes it to rebind_symbols_for_image. */
static struct rebindings_entry *_rebindings_head;

/*
 * Pushes a new node holding a private copy of rebindings[0 .. nel) onto the
 * front of the list whose head pointer is stored at *rebindings_head.
 *
 *   rebindings_head  in/out: address of the list head; updated on success.
 *   rebindings       array of `nel` records to copy; the caller's array is
 *                    only read, never retained.
 *   nel              number of records in `rebindings`.
 *
 * Returns 0 on success. Returns -1 when memory cannot be allocated or when
 * the byte size of the copy would overflow size_t; in both cases the list
 * is left untouched and nothing is leaked.
 */
static int prepend_rebindings(struct rebindings_entry **rebindings_head,
                              struct rebinding rebindings[],
                              size_t nel) {
    /*
     * Guard the multiplication below: a wrapped product would allocate a
     * short buffer and the memcpy would then write past its end.
     */
    if (nel > (size_t)-1 / sizeof(struct rebinding)) {
        return -1;
    }
    size_t copy_bytes = sizeof(struct rebinding) * nel;

    struct rebindings_entry *new_entry = malloc(sizeof *new_entry);
    if (!new_entry) {
        return -1;
    }
    new_entry->rebindings = malloc(copy_bytes);
    if (!new_entry->rebindings) {
        free(new_entry);
        return -1;
    }
    memcpy(new_entry->rebindings, rebindings, copy_bytes);
    new_entry->rebindings_nel = nel;

    /* Link the node in front of the current head. */
    new_entry->next = *rebindings_head;
    *rebindings_head = new_entry;
    return 0;
}
复制代码
先介绍最重要的一个库 dlfcn:
 int dladdr(const void* addr, Dl_info* info);
        The dladdr() function queries dyld (the dynamic linker) for information
     about the image containing the address addr.  The information is returned
     in the structure specified by info.  The structure contains at least the
     following members:

     const char* dli_fname     The pathname of the shared object containing
                               the address.

     void* dli_fbase           The base address (mach_header) at which the
                               image is mapped into the address space of the
                               calling process.

     const char* dli_sname     The name of the nearest run-time symbol with a
                               value less than or equal to addr.

     void* dli_saddr           The value of the symbol returned in dli_sname.

     The dladdr() function is available only in dynamically linked programs.

 const struct mach_header* _dyld_get_image_header(uint32_t image_index)  
                returns a pointer to the mach header of the image indexed by image_index.  
                If image_index is out of range, NULL is returned.
 intptr_t _dyld_get_image_vmaddr_slide(uint32_t image_index)
                returns the virtual memory address slide amount of the image 
                indexed by image_index. If image_index is out of range, zero is returned.


/*
 * The mach header appears at the very beginning of the object file.
 * The file also declares struct mach_header_64, which repeats these fields
 * and appends a trailing `reserved` word, so the two layouts are not
 * identical.
 */
struct mach_header {
	uint32_t	magic;		/* mach magic number identifier */
	cpu_type_t	cputype;	/* cpu specifier (CPU type) */
	cpu_subtype_t	cpusubtype;	/* machine specifier (machine type) */
	uint32_t	filetype;	/* type of file */
	uint32_t	ncmds;		/* number of load commands */
	uint32_t	sizeofcmds;	/* the size of all the load commands */
	uint32_t	flags;		/* flags */
};复制代码
/*
 * Mach header variant that carries the same seven fields as
 * struct mach_header, in the same order, followed by one extra
 * `reserved` word.
 */
struct mach_header_64 {
	uint32_t	magic;		/* mach magic number identifier */
	cpu_type_t	cputype;	/* cpu specifier */
	cpu_subtype_t	cpusubtype;	/* machine specifier */
	uint32_t	filetype;	/* type of file */
	uint32_t	ncmds;		/* number of load commands */
	uint32_t	sizeofcmds;	/* the size of all the load commands */
	uint32_t	flags;		/* flags */
	uint32_t	reserved;	/* reserved */
};复制代码


/*
 * The load commands directly follow the mach_header.  The total size of all
 * of the commands is given by the sizeofcmds field in the mach_header.  All
 * load commands must have as their first two fields cmd and cmdsize.  The cmd
 * field is filled in with a constant for that command type.  Each command type
 * has a structure specifically for it.  The cmdsize field is the size in bytes
 * of the particular load command structure plus anything that follows it that
 * is a part of the load command (i.e. section structures, strings, etc.).  To
 * advance to the next load command the cmdsize can be added to the offset or
 * pointer of the current load command.  The cmdsize for 32-bit architectures
 * MUST be a multiple of 4 bytes and for 64-bit architectures MUST be a multiple
 * of 8 bytes (these are forever the maximum alignment of any load commands).
 * The padded bytes must be zero.  All tables in the object file must also
 * follow these rules so the file can be memory mapped.  Otherwise the pointers
 * to these tables will not work well or at all on some machines.  With all
 * padding zeroed like objects will compare byte for byte.
 */
/*
 加载命令(load command)紧跟在 mach_header 之后。所有加载命令的总大小由 mach_header 中的 sizeofcmds 字段给出。
 所有加载命令的前两个字段必须是 cmd 和 cmdsize。cmd 字段的值是表示该命令类型的常量,每种命令类型都有
 与之对应的结构体。cmdsize 字段是以字节为单位的大小,等于该加载命令结构体本身的大小,再加上紧随其后、属于该加载命令的所有内容(例如 section 结构体、字符串等)。
 要前进到下一条加载命令,把 cmdsize 加到当前加载命令的偏移量或指针上即可。
 32 位体系结构的 cmdsize 必须是 4 字节的倍数,64 位体系结构的 cmdsize 必须是 8 字节的倍数(这永远是任何加载命令的最大对齐值)。
 填充字节必须为零。目标文件中的所有表也必须遵循这些规则,这样文件才能被内存映射;
 否则,指向这些表的指针在某些机器上将无法正常工作,甚至完全不能工作。把所有填充字节清零后,内容相同的目标文件可以逐字节比较相等。
*/

/*
 * Common prefix shared by every load command: the command type followed by
 * the total size of the command in bytes.
 *
 * Both fields are fixed 32-bit words (struct segment_command_64 declares the
 * same two leading fields as uint32_t). `unsigned long` is 8 bytes on LP64
 * targets and would make this struct 16 bytes instead of 8, so fixed-width
 * types are used.
 */
struct load_command {
	uint32_t cmd;		/* type of load command */
	uint32_t cmdsize;	/* total size of command in bytes */
};


/* Constants for the cmd field of all load commands, the type */
#define	LC_SEGMENT	0x1	/* segment of this file to be mapped */
#define	LC_SYMTAB	0x2	/* link-edit stab symbol table info */
#define	LC_SYMSEG	0x3	/* link-edit gdb symbol table info (obsolete) */
#define	LC_THREAD	0x4	/* thread */
#define	LC_UNIXTHREAD	0x5	/* unix thread (includes a stack) */
#define	LC_LOADFVMLIB	0x6	/* load a specified fixed VM shared library */
#define	LC_IDFVMLIB	0x7	/* fixed VM shared library identification */
#define	LC_IDENT	0x8	/* object identification info (obsolete) */
#define LC_FVMFILE	0x9	/* fixed VM file inclusion (internal use) */
#define LC_PREPAGE      0xa     /* prepage command (internal use) */
#define	LC_DYSYMTAB	0xb	/* dynamic link-edit symbol table info */
#define	LC_LOAD_DYLIB	0xc	/* load a dynamically linked shared library */
#define	LC_ID_DYLIB	0xd	/* dynamically linked shared lib identification */
#define LC_LOAD_DYLINKER 0xe	/* load a dynamic linker */
#define LC_ID_DYLINKER	0xf	/* dynamic linker identification */
#define	LC_PREBOUND_DYLIB 0x10	/* modules prebound for a dynamically */
				/*  linked shared library */复制代码


/*
 * The segment load command indicates that a part of this file is to be
 * mapped into the task's address space.  The size of this segment in memory,
 * vmsize, maybe equal to or larger than the amount to map from this file,
 * filesize.  The file is mapped starting at fileoff to the beginning of
 * the segment in memory, vmaddr.  The rest of the memory of the segment,
 * if any, is allocated zero fill on demand.  The segment's maximum virtual
 * memory protection and initial virtual memory protection are specified
 * by the maxprot and initprot fields.  If the segment has sections then the
 * section structures directly follow the segment command and their size is
 * reflected in cmdsize.
 */
/*
 * Segment load command for 32-bit architectures.
 *
 * Every integer field is a fixed 32-bit word. `unsigned long` is 8 bytes on
 * LP64 targets and would shift every field after `cmd`, so fixed-width types
 * are used (matching the uint32_t cmd/cmdsize/nsects/flags of
 * struct segment_command_64).
 */
struct segment_command {	        /* for 32-bit architectures */
	uint32_t	cmd;		/* LC_SEGMENT */
	uint32_t	cmdsize;	/* includes sizeof section structs */
	char		segname[16];	/* segment name, e.g. __TEXT, __DATA, __LINKEDIT */
	uint32_t	vmaddr;		/* memory (virtual) address of this segment */
	uint32_t	vmsize;		/* memory size of this segment */
	uint32_t	fileoff;	/* file offset of this segment */
	uint32_t	filesize;	/* amount to map from the file */
	vm_prot_t	maxprot;	/* maximum VM protection */
	vm_prot_t	initprot;	/* initial VM protection */
	uint32_t	nsects;		/* number of sections in segment */
	uint32_t	flags;		/* flags */
};

/*
 * The 64-bit segment load command indicates that a part of this file is to be
 * mapped into a 64-bit task's address space.  If the 64-bit segment has
 * sections then section_64 structures directly follow the 64-bit segment
 * command and their size is reflected in cmdsize.
 */
struct segment_command_64 {	/* for 64-bit architectures */
	uint32_t	cmd;		/* LC_SEGMENT_64 */
	uint32_t	cmdsize;	/* includes sizeof section_64 structs */
	char		segname[16];	/* segment name, e.g. __TEXT, __DATA, __LINKEDIT */
	uint64_t	vmaddr;		/* memory address of this segment (the image slide is added to it at run time) */
	uint64_t	vmsize;		/* memory size of this segment */
	uint64_t	fileoff;	/* file offset of this segment */
	uint64_t	filesize;	/* amount to map from the file */
	vm_prot_t	maxprot;	/* maximum VM protection */
	vm_prot_t	initprot;	/* initial VM protection */
	uint32_t	nsects;		/* number of sections in segment */
	uint32_t	flags;		/* flags */
};
复制代码


/*
 * LC_SYMTAB load command: locates the symbol table and the string table by
 * offset and size.
 *
 * All six fields are fixed 32-bit words; `unsigned long` is 8 bytes on LP64
 * targets and would make this struct 48 bytes instead of 24, so fixed-width
 * types are used.
 */
struct symtab_command {
	uint32_t	cmd;		/* LC_SYMTAB */
	uint32_t	cmdsize;	/* sizeof(struct symtab_command) */
	uint32_t	symoff;		/* symbol table offset */
	uint32_t	nsyms;		/* number of symbol table entries */
	uint32_t	stroff;		/* string table offset */
	uint32_t	strsize;	/* string table size in bytes */
};

复制代码


/*
 * LC_DYSYMTAB load command: describes how the symbol table is partitioned
 * and where the auxiliary tables used for dynamic linking live (table of
 * contents, module table, reference table, indirect symbol table and
 * relocation entries).
 *
 * All twenty fields are fixed 32-bit words; `unsigned long` is 8 bytes on
 * LP64 targets and would double the struct to 160 bytes, so fixed-width
 * types are used.
 */
struct dysymtab_command {
    uint32_t cmd;		/* LC_DYSYMTAB */
    uint32_t cmdsize;	/* sizeof(struct dysymtab_command) */

    /*
     * The symbols indicated by symoff and nsyms of the LC_SYMTAB load command
     * are grouped into the following three groups:
     *    local symbols (further grouped by the module they are from)
     *    defined external symbols (further grouped by the module they are from)
     *    undefined symbols
     *
     * The local symbols are used only for debugging.  The dynamic binding
     * process may have to use them to indicate to the debugger the local
     * symbols for a module that is being bound.
     *
     * The last two groups are used by the dynamic binding process to do the
     * binding (indirectly through the module table and the reference symbol
     * table when this is a dynamically linked shared library file).
     */
    uint32_t ilocalsym;	/* index to local symbols */
    uint32_t nlocalsym;	/* number of local symbols */

    uint32_t iextdefsym;	/* index to externally defined symbols */
    uint32_t nextdefsym;	/* number of externally defined symbols */

    uint32_t iundefsym;	/* index to undefined symbols */
    uint32_t nundefsym;	/* number of undefined symbols */

    /*
     * For the dynamic binding process to find which module a symbol
     * is defined in the table of contents is used (analogous to the ranlib
     * structure in an archive) which maps defined external symbols to modules
     * they are defined in.  This exists only in a dynamically linked shared
     * library file.  For executable and object modules the defined external
     * symbols are sorted by name and are used as the table of contents.
     */
    uint32_t tocoff;	/* file offset to table of contents */
    uint32_t ntoc;		/* number of entries in table of contents */

    /*
     * To support dynamic binding of "modules" (whole object files) the symbol
     * table must reflect the modules that the file was created from.  This is
     * done by having a module table that has indexes and counts into the merged
     * tables for each module.  The module structure that these two entries
     * refer to is described below.  This exists only in a dynamically linked
     * shared library file.  For executable and object modules the file only
     * contains one module so everything in the file belongs to the module.
     */
    uint32_t modtaboff;	/* file offset to module table */
    uint32_t nmodtab;	/* number of module table entries */

    /*
     * To support dynamic module binding the module structure for each module
     * indicates the external references (defined and undefined) each module
     * makes.  For each module there is an offset and a count into the
     * reference symbol table for the symbols that the module references.
     * This exists only in a dynamically linked shared library file.  For
     * executable and object modules the defined external symbols and the
     * undefined external symbols indicate the external references.
     */
    uint32_t extrefsymoff;  /* offset to referenced symbol table */
    uint32_t nextrefsyms;	 /* number of referenced symbol table entries */

    /*
     * The sections that contain "symbol pointers" and "routine stubs" have
     * indexes and (implied counts based on the size of the section and fixed
     * size of the entry) into the "indirect symbol" table for each pointer
     * and stub.  For every section of these two types the index into the
     * indirect symbol table is stored in the section header in the field
     * reserved1.  An indirect symbol table entry is simply a 32bit index into
     * the symbol table to the symbol that the pointer or stub is referring to.
     * The indirect symbol table is ordered to match the entries in the section.
     */
    uint32_t indirectsymoff; /* file offset to the indirect symbol table */
    uint32_t nindirectsyms;  /* number of indirect symbol table entries */

    /*
     * To support relocating an individual module in a library file quickly the
     * external relocation entries for each module in the library need to be
     * accessed efficiently.  Since the relocation entries can't be accessed
     * through the section headers for a library file they are separated into
     * groups of local and external entries further grouped by module.  In this
     * case the presence of this load command whose extreloff, nextrel,
     * locreloff and nlocrel fields are non-zero indicates that the relocation
     * entries of non-merged sections are not referenced through the section
     * structures (and the reloff and nreloc fields in the section headers are
     * set to zero).
     *
     * Since the relocation entries are not accessed through the section headers
     * this requires the r_address field to be something other than a section
     * offset to identify the item to be relocated.  In this case r_address is
     * set to the offset from the vmaddr of the first LC_SEGMENT command.
     *
     * The relocation entries are grouped by module and the module table
     * entries have indexes and counts into them for the group of external
     * relocation entries for that module.
     *
     * For sections that are merged across modules there must not be any
     * remaining external relocation entries for them (for merged sections
     * remaining relocation entries must be local).
     */
    uint32_t extreloff;	/* offset to external relocation entries */
    uint32_t nextrel;	/* number of external relocation entries */

    /*
     * All the local relocation entries are grouped together (they are not
     * grouped by their module since they are only used if the object is moved
     * from its statically link edited address).
     */
    uint32_t locreloff;	/* offset to local relocation entries */
    uint32_t nlocrel;	/* number of local relocation entries */

};



/*
 * A segment is made up of zero or more sections.  Non-MH_OBJECT files have
 * all of their segments with the proper sections in each, and padded to the
 * specified segment alignment when produced by the link editor.  The first
 * segment of a MH_EXECUTE and MH_FVMLIB format file contains the mach_header
 * and load commands of the object file before it's first section.  The zero
 * fill sections are always last in their segment (in all formats).  This
 * allows the zeroed segment padding to be mapped into memory where zero fill
 * sections might be. The gigabyte zero fill sections, those with the section
 * type S_GB_ZEROFILL, can only be in a segment with sections of this type.
 * These segments are then placed after all other segments.
 *
 * The MH_OBJECT format has all of it's sections in one segment for
 * compactness.  There is no padding to a specified segment boundary and the
 * mach_header and load commands are not part of the segment.
 *
 * Sections with the same section name, sectname, going into the same segment,
 * segname, are combined by the link editor.  The resulting section is aligned
 * to the maximum alignment of the combined sections and is the new section's
 * alignment.  The combined sections are aligned to their original alignment in
 * the combined section.  Any padded bytes to get the specified alignment are
 * zeroed.
 *
 * The format of the relocation entries referenced by the reloff and nreloc
 * fields of the section structure for mach object files is described in the
 * header file <reloc.h>.
 */
/*
 * Section header for 32-bit architectures; an array of these follows the
 * segment command that owns them.
 *
 * Every integer field is a fixed 32-bit word. `unsigned long` is 8 bytes on
 * LP64 targets and would grow the struct from 68 to 104 bytes, so
 * fixed-width types are used.
 */
struct section {		/* for 32-bit architectures */
	char		sectname[16];	/* name of this section */
	char		segname[16];	/* segment this section goes in */
	uint32_t	addr;		/* memory address of this section */
	uint32_t	size;		/* size in bytes of this section */
	uint32_t	offset;		/* file offset of this section */
	uint32_t	align;		/* section alignment (power of 2) */
	uint32_t	reloff;		/* file offset of relocation entries */
	uint32_t	nreloc;		/* number of relocation entries */
	uint32_t	flags;		/* flags (section type and attributes)*/
	uint32_t	reserved1;	/* reserved (for offset or index) */
	uint32_t	reserved2;	/* reserved (for count or sizeof) */
};

/*
 * Section header for 64-bit architectures: same fields as struct section
 * with 64-bit addr/size and an extra trailing reserved3 word.
 */
struct section_64 { /* for 64-bit architectures */
	char		sectname[16];	/* name of this section */
	char		segname[16];	/* segment this section goes in */
	uint64_t	addr;		/* memory address of this section */
	uint64_t	size;		/* size in bytes of this section */
	uint32_t	offset;		/* file offset of this section */
	uint32_t	align;		/* section alignment (power of 2) */
	uint32_t	reloff;		/* file offset of relocation entries */
	uint32_t	nreloc;		/* number of relocation entries */
	uint32_t	flags;		/* flags (section type and attributes)*/
	uint32_t	reserved1;	/* reserved (for offset or index); for symbol-pointer sections, the index into the indirect symbol table */
	uint32_t	reserved2;	/* reserved (for count or sizeof) */
	uint32_t	reserved3;	/* reserved */
};复制代码


ASLR

ASLR 是 Address Space Layout Randomization 的缩写,这个概念在业界由来已久,并非苹果原创。由于 vmaddr (虚拟地址) 是链接器链接的时候写入 Mach-O 文件的,对于一个程序来说是静态不变的,因此给黑客攻击带来了便利,iOS 4.3 以后引入了 ASLR,给每个镜像在 vmaddr 的基础上再加一个随机的偏移量 slide,因此每段数据的真实的虚拟地址是 vmaddr + slide。


static void rebind_symbols_for_image(struct rebindings_entry *rebindings,  
                                     const struct mach_header *header,           
                                     intptr_t slide) {  
    Dl_info info;  
    if (dladdr(header, &info) == 0) {    
        return;  
    }
    segment_command_t *cur_seg_cmd;  
    segment_command_t *linkedit_segment = NULL;  
    struct symtab_command* symtab_cmd = NULL;  
    struct dysymtab_command* dysymtab_cmd = NULL;
    // 获取Load Command的起始位置
    uintptr_t cur = (uintptr_t)header + sizeof(mach_header_t);  
    // 便利每一个Command来获取几个表格的位置
    for (uint i = 0; i < header->ncmds; i++, cur += cur_seg_cmd->cmdsize) {    
        cur_seg_cmd = (segment_command_t *)cur;
        //在LC_SEGMENT 中遍历寻找__LINKEDIT的 Section    
        if (cur_seg_cmd->cmd == LC_SEGMENT_ARCH_DEPENDENT) {      
            if (strcmp(cur_seg_cmd->segname, SEG_LINKEDIT) == 0) {        
                linkedit_segment = cur_seg_cmd;      
            }    
        //遍历寻找LC_SYMTAB
        } else if (cur_seg_cmd->cmd == LC_SYMTAB) {      
            symtab_cmd = (struct symtab_command*)cur_seg_cmd;   
        //遍历寻找LC_DYSYMTAB 
        } else if (cur_seg_cmd->cmd == LC_DYSYMTAB) {      
            dysymtab_cmd = (struct dysymtab_command*)cur_seg_cmd;    
        }  
    }

    /* 检测必要的数据结构是否都存在 
     *  LC_SYMTAB这个LoadCommand主要提供了两个信息 
     *       Symbol Table的偏移量与Symbol Table中元素的个数
     *       String Table的偏移量与String Table的长度
     *   LC_DYSYMTAB提供了动态符号表的位移和元素个数,还有一些其他的表格索引
     *   LC_SEGMENT.__LINKEDIT 含有为动态链接库使用的原始数据
     */
    if (!symtab_cmd || !dysymtab_cmd || !linkedit_segment || !dysymtab_cmd->nindirectsyms) {    
        return;  
    }
 
    // 找到__LINKEDIT段的头地址 
    uintptr_t linkedit_base = (uintptr_t)slide + linkedit_segment->vmaddr - linkedit_segment->fileoff;  
    // 获取符号表的真实地址
    // 符号表的地址 = 基址 + 符号表偏移量
    nlist_t *symtab = (nlist_t *)(linkedit_base + symtab_cmd->symoff);  
    // 获取字符表的真实地址
    // 字符串表的地址 = 基址 + 字符串表偏移量
    char *strtab = (char *)(linkedit_base + symtab_cmd->stroff); 
    // 获取间接符号表的真实地址
    uint32_t *indirect_symtab = (uint32_t *)(linkedit_base + dysymtab_cmd->indirectsymoff);
    
    cur = (uintptr_t)header + sizeof(mach_header_t);  
    for (uint i = 0; i < header->ncmds; i++, cur += cur_seg_cmd->cmdsize) {    
        cur_seg_cmd = (segment_command_t *)cur;    
        if (cur_seg_cmd->cmd == LC_SEGMENT_ARCH_DEPENDENT) {      
            if (strcmp(cur_seg_cmd->segname, SEG_DATA) != 0 && strcmp(cur_seg_cmd->segname, SEG_DATA_CONST) != 0) {        
                continue;      
            }      
            for (uint j = 0; j < cur_seg_cmd->nsects; j++) {        
                section_t *sect = (section_t *)(cur + sizeof(segment_command_t)) + j;        
                if ((sect->flags & SECTION_TYPE) == S_LAZY_SYMBOL_POINTERS) {          
                    perform_rebinding_with_section(rebindings, sect, slide, symtab, strtab, indirect_symtab);        
                }        
                if ((sect->flags & SECTION_TYPE) == S_NON_LAZY_SYMBOL_POINTERS) {          
                    perform_rebinding_with_section(rebindings, sect, slide, symtab, strtab, indirect_symtab);        
                }      
            }    
        }  
    }
}

/*
 * Applies the file-level rebinding list (_rebindings_head) to one image by
 * forwarding `header` and `slide` unchanged to rebind_symbols_for_image.
 * NOTE(review): the (header, slide) signature looks like a dyld image
 * callback; no registration is visible in this excerpt — confirm.
 */
static void _rebind_symbols_for_image(const struct mach_header *header,  
                                      intptr_t slide) {    
    rebind_symbols_for_image(_rebindings_head, header, slide);
}复制代码



/*
 * Walks the pointer slots of one symbol-pointer section. For each slot whose
 * symbol name, with its first character skipped, equals the `name` of a
 * requested rebinding, the slot is overwritten with that rebinding's
 * `replacement`. The first match in list order wins.
 *
 *   rebindings       list of rebinding batches to search
 *   section          section whose slots are rewritten
 *   slide            added to section->addr to find the slots in memory
 *   symtab           symbol table of the image
 *   strtab           string table of the image
 *   indirect_symtab  indirect symbol table of the image
 *
 * When a matching rebinding has a non-NULL `replaced` and the slot does not
 * already hold the replacement, the slot's previous value is stored through
 * `replaced` before the slot is overwritten.
 */
static void perform_rebinding_with_section(struct rebindings_entry *rebindings,
                                           section_t *section,
                                           intptr_t slide,
                                           nlist_t *symtab,
                                           char *strtab,
                                           uint32_t *indirect_symtab) {
    /* reserved1 is this section's starting index in the indirect symbol table. */
    uint32_t *slot_symbols = indirect_symtab + section->reserved1;
    void **slots = (void **)((uintptr_t)slide + section->addr);

    for (uint slot = 0; slot < section->size / sizeof(void *); slot++) {
        uint32_t sym = slot_symbols[slot];
        /* Entries marked absolute and/or local have no symbol to look up. */
        if (sym == INDIRECT_SYMBOL_ABS || sym == INDIRECT_SYMBOL_LOCAL ||
            sym == (INDIRECT_SYMBOL_LOCAL | INDIRECT_SYMBOL_ABS)) {
            continue;
        }

        char *name = strtab + symtab[sym].n_un.n_strx;
        /* Names shorter than two characters can never match: skip them. */
        if (!(name[0] && name[1])) {
            continue;
        }

        bool rebound = false;
        for (struct rebindings_entry *batch = rebindings;
             batch && !rebound;
             batch = batch->next) {
            for (uint k = 0; k < batch->rebindings_nel; k++) {
                /* The comparison starts at the name's second character. */
                if (strcmp(&name[1], batch->rebindings[k].name) != 0) {
                    continue;
                }
                if (batch->rebindings[k].replaced != NULL &&
                    slots[slot] != batch->rebindings[k].replacement) {
                    *(batch->rebindings[k].replaced) = slots[slot];
                }
                slots[slot] = batch->rebindings[k].replacement;
                /* Stop searching and move on to the next slot. */
                rebound = true;
                break;
            }
        }
    }
}


转载于:https://juejin.im/post/5cc2ad1a5188252e7f08ed4e

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值