start_kernel——setup_per_cpu_areas

setup_per_cpu_areas是为了对内核的内存管理(mm)进行初始化而调用的函数之一。只在SMP系统中调用,UP中不执行任何操作。
setup_per_cpu_areas函数为SMP的每个处理器生成per-cpu数据。
per-cpu数据按CPU分别使用,以将导致性能低下的缓存一致性(cache coherency)问题减小到最小。per-cpu数据由各cpu独立使用,即使不加锁也可访问,十分高效。
以下摘抄自:http://blog.csdn.net/yunsongice/article/details/5605239

每CPU变量主要是数据结构的数组,系统的每个CPU对应数组的一个元素。

一个CPU不应该访问与其他CPU对应的数组元素,另外,它可以随意读或修改它自己的元素而不用担心出现竞争条件,因为它是唯一有资格这么做的CPU。但是,这也意味着每CPU变量基本上只能在特殊情况下使用,也就是当它确定在系统的CPU上的数据在逻辑上是独立的时候。

每CPU的数组元素在主存中被排列以使每个数据结构存放在硬件高速缓存的不同行,因此,对每CPU数组的并发访问不会导致高速缓存行的窃用和失效(这种操作会带来昂贵的系统开销)。

虽然每CPU变量为来自不同CPU的并发访问提供保护,但对来自异步函数(中断处理程序和可延迟函数)的访问不提供保护,在这种情况下需要另外的同步技术。

此外,在单处理器和多处理器系统中,内核抢占都可能使每CPU变量产生竞争条件。总的原则是内核控制路径应该在禁用抢占的情况下访问每CPU变量。因为当一个内核控制路径获得了它的每CPU变量本地副本的地址,然后它因被抢占而转移到另外一个CPU上,但仍然引用原来CPU元素的地址,这是非常危险的。

init/main.c中的start_kernel调用setup_per_cpu_areas,其x86实现位于arch/x86/kernel/setup_percpu.c

/*
 * setup_per_cpu_areas - create the per-cpu areas and publish their offsets.
 *
 * Steps, in order:
 *  1. Build the first percpu chunk.  The embedding allocator is tried
 *     unless pcpu_chosen_fc selects PCPU_FC_PAGE; if it is skipped or
 *     fails, the page allocator is used.  If that fails too, panic().
 *  2. For every possible CPU, record its per-cpu offset, initialise
 *     this_cpu_off and cpu_number, set up the percpu and stack canary
 *     segments, and copy the early (static array) maps into the new
 *     per-cpu variables.
 *  3. Clear the early_per_cpu pointers so the static arrays are no longer
 *     referenced, then set up the node-to-cpumask map and the cpu local
 *     masks.
 *
 * Takes no arguments and returns nothing; failure to allocate is fatal.
 */
void __init setup_per_cpu_areas(void)
{
    unsigned int cpu;
    unsigned long delta;
    int rc;

    /*
     * Trace the CPU / node limits seen at this point of boot.
     * NOTE(review): KERN_EMERG is used here, presumably so the line is
     * always shown while experimenting -- confirm before keeping it.
     */
    printk(KERN_EMERG "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
        NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

    /*
     * Allocate percpu area.  Embedding allocator is our favorite;
     * however, on NUMA configurations, it can result in very
     * sparse unit mapping and vmalloc area isn't spacious enough
     * on 32bit.  Use page in that case.
     */
#ifdef CONFIG_X86_32
    if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
        pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
    /*
     * Start from an error value so that, when the embedding path below
     * is skipped, the page allocator fallback still runs.
     */
    rc = -EINVAL;
    if (pcpu_chosen_fc != PCPU_FC_PAGE) {
        /* Dynamic area size: module + dynamic reserve minus the first chunk reserve. */
        const size_t dyn_size = PERCPU_MODULE_RESERVE +
            PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
        size_t atom_size;

        /*
         * On 64bit, use PMD_SIZE for atom_size so that embedded
         * percpu areas are aligned to PMD.  This, in the future,
         * can also allow using PMD mappings in vmalloc area.  Use
         * PAGE_SIZE on 32bit as vmalloc space is highly contended
         * and large vmalloc area allocs can easily fail.
         */
#ifdef CONFIG_X86_64
        atom_size = PMD_SIZE;
#else
        atom_size = PAGE_SIZE;
#endif
        rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                        dyn_size, atom_size,
                        pcpu_cpu_distance,
                        pcpu_fc_alloc, pcpu_fc_free);
        /* Not fatal: warn and let the page allocator below take over. */
        if (rc < 0)
            pr_warning("%s allocator failed (%d), falling back to page size\n",
                   pcpu_fc_names[pcpu_chosen_fc], rc);
    }
    /* Reached with rc < 0 when embedding was skipped or failed. */
    if (rc < 0)
        rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                       pcpu_fc_alloc, pcpu_fc_free,
                       pcpup_populate_pte);
    /* No allocator succeeded; there is nothing left to try. */
    if (rc < 0)
        panic("cannot initialize percpu area (err=%d)", rc);

    /* alrighty, percpu areas up and running */
    /*
     * delta is the distance from __per_cpu_start to pcpu_base_addr;
     * adding a CPU's unit offset to it gives that CPU's per-cpu offset.
     */
    delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
    for_each_possible_cpu(cpu) {
        per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
        /* Mirror the offset and the CPU number into per-cpu variables. */
        per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
        per_cpu(cpu_number, cpu) = cpu;
        setup_percpu_segment(cpu);
        setup_stack_canary_segment(cpu);
        /*
         * Copy data used in early init routines from the
         * initial arrays to the per cpu data areas.  These
         * arrays then become expendable and the *_early_ptr's
         * are zeroed indicating that the static arrays are
         * gone.
         */
#ifdef CONFIG_X86_LOCAL_APIC
        per_cpu(x86_cpu_to_apicid, cpu) =
            early_per_cpu_map(x86_cpu_to_apicid, cpu);
        per_cpu(x86_bios_cpu_apicid, cpu) =
            early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_32
        per_cpu(x86_cpu_to_logical_apicid, cpu) =
            early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
        /*
         * Point irq_stack_ptr 64 elements below the end of this CPU's
         * irq_stack (64 bytes if irq_stack is a char array -- TODO confirm).
         */
        per_cpu(irq_stack_ptr, cpu) =
            per_cpu(irq_stack_union.irq_stack, cpu) +
            IRQ_STACK_SIZE - 64;
#endif
#ifdef CONFIG_NUMA
        per_cpu(x86_cpu_to_node_map, cpu) =
            early_per_cpu_map(x86_cpu_to_node_map, cpu);
        /*
         * Ensure that the boot cpu numa_node is correct when the boot
         * cpu is on a node that doesn't have memory installed.
         * Also cpu_up() will call cpu_to_node() for APs when
         * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
         * up later with c_init aka intel_init/amd_init.
         * So set them all (boot cpu and all APs).
         */
        set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
        /*
         * Up to this point, the boot CPU has been using .init.data
         * area.  Reload any changed state for the boot CPU.
         */
        /* Only CPU 0 is switched here. */
        if (!cpu)
            switch_to_new_gdt(cpu);
    }

    /* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
    early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
    early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#ifdef CONFIG_X86_32
    early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
#endif
#ifdef CONFIG_NUMA
    early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

    /* Setup node to cpumask map */
    setup_node_to_cpumask_map();

    /* Setup cpu initialized, callin, callout masks */
    setup_cpu_local_masks();
}

打印Log

NR_CPUS:4 nr_cpumask_bits:4 nr_cpu_ids:4 nr_node_ids:1


每CPU变量的声明和普通变量的声明一样,主要的区别是使用了__attribute__((section(PER_CPU_BASE_SECTION sec)))来指定该变量被放置的段,而普通变量默认会被放置在data段或者bss段中。
看到这里有一个问题:如果我们只是声明了一个变量,那么多个副本是从哪里来的呢?奥妙在于内核加载的过程。
一般情况下,ELF文件中的每一个段在内存中只会有一个副本,而.data..percpu段在加载后,又被复制了NR_CPUS次,一个每CPU变量的多个副本在内存中是不会相邻的。
分配内存以及复制.data..percpu内容的工作由pcpu_embed_first_chunk来完成,这里就不展开了。__per_cpu_offset数组中记录了每个CPU的percpu区域相对于__per_cpu_start的偏移量。我们访问每CPU变量就要依靠__per_cpu_offset中的偏移量。(摘录自:http://blog.csdn.net/fengtaocat/article/details/7078472)
PAGE_SIZE的大小为4K,实现代码如下:
arch/x86/include/asm/page_types.h

/*
 * PAGE_SHIFT determines the page size: PAGE_SIZE is 1 shifted left by
 * PAGE_SHIFT, i.e. 1 << 12 = 4096 bytes (4K).
 * _AC(1,UL) becomes 1UL in C and plain 1 in assembly, so this definition
 * can be used from both.
 */
#define PAGE_SHIFT  12
#define PAGE_SIZE   (_AC(1,UL) << PAGE_SHIFT)

include/uapi/linux/const.h

/* Some constant macros are used in both assembler and
 * C code.  Therefore we cannot annotate them always with
 * 'UL' and other type specifiers unilaterally.  We
 * use the following macros to deal with this.
 *
 * Similarly, _AT() will cast an expression with a type in C, but
 * leave it unchanged in asm.
 *
 * _AC(X,Y): constant X with type suffix Y (e.g. _AC(1,UL)).
 * _AT(T,X): expression X cast to type T.
 */

#ifdef __ASSEMBLY__
/* Assembly: drop the suffix / the cast and keep only the bare value. */
#define _AC(X,Y)    X
#define _AT(T,X)    X
#else   // __ASSEMBLY__ not defined: C code
/*
 * __AC pastes the value and the suffix into one token (1 and UL -> 1UL).
 * _AC goes through __AC so that macro arguments are expanded before the
 * paste takes place.
 */
#define __AC(X,Y)   (X##Y)
#define _AC(X,Y)    __AC(X,Y)
#define _AT(T,X)    ((T)(X))
#endif

“某些常量宏会同时被c和asm引用,而c与asm在对立即数符号的处理上是不同的。asm中通过指令来区分其操作数是有符号还是无符号,而不是通过操作数。而c中是通过变量的属性,而不是通过操作符。c中如要指明常量有无符号,必须为常量添加后缀,而asm则通过使用不同的指令来指明。如此,当一个常量被c和asm同时包含时,必须作不同的处理。故AFLAGS中将添加一项-D__ASSEMBLY__,来告知预处理器此时是asm”(http://my.oschina.net/u/930588/blog/134751)

原理已经很清楚,具体代码不再分析了。

阅读更多

start_kernel

10-29

[ 注:内核版本Linux-2.6.30 ]rnrnsetup_arch执行是由start_kernel来调用的:rnrnstart_kernel [ init/main.c ] --> setup_arch(&command_line) [arch/arm/kernel/setup.c ] rnrn[cpp] view plaincopyrnvoid __init setup_arch(char **cmdline_p) rn rn struct tag *tags = (struct tag *)&init_tags; rn struct machine_desc *mdesc; rn char *from = default_command_line; /* [Voice] configuration in defconfig file */ rn rn unwind_init(); rn rn setup_processor(); rn mdesc = setup_machine(machine_arch_type); /* [Voice] This get from include/generated/mach-types.h */ rn machine_name = mdesc->name; rn rn if (mdesc->soft_reboot) rn reboot_setup("s"); rn rn if (__atags_pointer) /* [Voice] unsigned int __atags_pointer __initdata;*/ rn tags = phys_to_virt(__atags_pointer); rn else if (mdesc->boot_params) rn tags = phys_to_virt(mdesc->boot_params); rn rn /* rn * If we have the old style parameters, convert them to rn * a tag list. rn */ rn if (tags->hdr.tag != ATAG_CORE) rn convert_to_tag_list(tags); rn if (tags->hdr.tag != ATAG_CORE) rn tags = (struct tag *)&init_tags; rn rn if (mdesc->fixup) rn mdesc->fixup(mdesc, tags, &from, &meminfo); rn rn if (tags->hdr.tag == ATAG_CORE) rn if (meminfo.nr_banks != 0) rn squash_mem_tags(tags); rn save_atags(tags); rn parse_tags(tags); rn rn rn init_mm.start_code = (unsigned long) _text; rn init_mm.end_code = (unsigned long) _etext; rn init_mm.end_data = (unsigned long) _edata; rn init_mm.brk = (unsigned long) _end; rn rn memcpy(boot_command_line, from, COMMAND_LINE_SIZE); rn boot_command_line[COMMAND_LINE_SIZE-1] = '\0'; rn parse_cmdline(cmdline_p, from); rn paging_init(mdesc); rn request_standard_resources(&meminfo, mdesc); rn rn#ifdef CONFIG_SMP rn smp_init_cpus(); rn#endif rn rn cpu_init(); rn rn /* rn * Set up various architecture-specific pointers rn */ rn init_arch_irq = mdesc->init_irq; rn system_timer = mdesc->timer; rn init_machine = mdesc->init_machine; rn rn#ifdef CONFIG_VT rn#if defined(CONFIG_VGA_CONSOLE) rn conswitchp = &vga_con; rn#elif 
defined(CONFIG_DUMMY_CONSOLE) rn conswitchp = &dummy_con; rn#endif rn#endif rn early_trap_init(); rn rnrn以下进行逐行分析:rn1. default_command_linernrn static char default_command_line[COMMAND_LINE_SIZE] __initdata = CONFIG_CMDLINE;rnrn 其中的CONFIG_CMDLINE来自于board的默认配置选项。rnrn2. setup_processor();rnrn 首先读取cpuid,read_cpuid_id() --> read_cpuid(CPUID_ID), 其中CPUID_ID为0, 而read_cpuid为一个汇编代码。如下:rnrn[cpp] view plaincopyrn#define read_cpuid(reg) \ rn( \ rnunsigned int __val; \ rnasm("mrc p15, 0, %0, c0, c0, " __stringify(reg) \ rn: "=r" (__val) \ rn: \ rn: "cc"); \ rn__val; \ rn) rn 然后执行lookup_processor_type(); 其定义在:[ arch/arm/kernel/head-common.S ]rnrn cpu_name, elf_platform, elf_hwcap 变量得到赋值。rnrn 接着是: cacheid_init()rnrn 读取CPU的cachetype和arch结构,如果arch结构小于ARMv6,则cacheid = CACHEID_VIVT。rnrn 最后执行:cpu_proc_init()rnrn [此处比较迷茫]rnrn3. mdesc = setup_machine(machine_arch_type)rnrn machine_arch_type来自于文件:include/generated/mach-types.h [ 此文件是生成文件 ]rnrn 这个只是对应一个number,然后通过这个number去查找到相关信息。每一种板子对应于一个特定的number。然后相关的描述来于MACHINE_START和MACHINE_END。rnrn 接着:setup_machine --> lookup_machine_type(nr) [ 定义于head.S ]rnrn machine_name 变量得到赋值。rnrn4. phys_to_virt(mdesc->bootparams)rnrn phys_to_virt(mdesc->bootparams) --> __phys_to_virt((usigned long)(x)) --> ((x) - PHYS_OFFSET + PAGE_OFFSET)rnrn #ifndef PHYS_OFFSETrn #define PHYS_OFFSET (CONFIG_DRAM_BASE)rn #endifrnrn #ifndef PAGE_OFFSETrn #define PAGE_OFFSET (PHYS_OFFSET)rn #endif rnrn5. _text, _etext, _edata, _end rnrn 这四个参数来自于文件:[ arch/arm/kernel/vmlinux.lds.S ]rnrn init_mm部份参数得到赋值。rnrn6. parse_cmdline(cmdline_p, from)rnrnrn7. paging_init(mdesc)rnrn pageing_init() sets up the page tables, initialises the zone memory maps, and sets up the zero page, bad page and bad page tables.rnrn8. request_standard_resources(&meminfo, mdesc)rnrnrn9. cpu_init()rnrnrn10. CONFIG_VT && CONFIG_DUMMY_CONSOLErnrn conswitchp = &dummy_con; [ conswitchp ==> drivers/char/vt.c;dummy_con ==> driver/video/console/dummycon.c ]rnrnrn11. early_trap_init() [ arch/arm/kernel/traps.c ]

没有更多推荐了,返回首页