	__INITDATA

NEXT_PAGE(early_level4_pgt)
	.fill	511,8,0
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(early_dynamic_pgts)
	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0

	.data

NEXT_PAGE(init_level4_pgt)
	.fill	512,8,0

NEXT_PAGE(level3_kernel_pgt)
	.fill	L3_START_KERNEL,8,0
	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE

NEXT_PAGE(level2_kernel_pgt)
	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)

NEXT_PAGE(level2_fixmap_pgt)
	.fill	506,8,0
	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
	.fill	5,8,0

NEXT_PAGE(level1_fixmap_pgt)
	.fill	512,8,0
This data structure is fairly clear. If you substitute the two macros below, NEXT_PAGE and PMDS, into the structure above:
#define NEXT_PAGE(name) \
	.balign	PAGE_SIZE; \
GLOBAL(name)

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT)			\
	i = 0 ;						\
	.rept (COUNT) ;					\
	.quad	(START) + (i << PMD_SHIFT) + (PERM) ;	\
	i = i + 1 ;					\
	.endr
then we can easily draw the following diagram:
The initialization steps that follow are all built on top of this early page-table structure.
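To make the layout concrete, here is a minimal user-space C sketch (not kernel code; the constants are hard-coded, and 0x1e3 is my assumption for the __PAGE_KERNEL_LARGE_EXEC bits) that reproduces the 511/510 indexes from the `(2^48-...)/(2^30) = 510` comment above and the first few .quad entries that PMDS emits:

#include <stdio.h>
#include <stdint.h>

/* Constants hard-coded for illustration; the real values live in
 * kernel headers. */
#define START_KERNEL_MAP        0xffffffff80000000ULL  /* __START_KERNEL_map */
#define PGDIR_SHIFT             39
#define PUD_SHIFT               30
#define PMD_SHIFT               21
#define PAGE_KERNEL_LARGE_EXEC  0x1e3ULL               /* assumed flag bits */

int main(void)
{
    /* the magic table indexes fall straight out of __START_KERNEL_map */
    printf("pgd index = %llu\n",   /* 511 -> the .fill 511,8,0 slot */
           (START_KERNEL_MAP >> PGDIR_SHIFT) & 511);
    printf("pud index = %llu\n",   /* 510 -> L3_START_KERNEL */
           (START_KERNEL_MAP >> PUD_SHIFT) & 511);

    /* first few of the 256 entries PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
     * KERNEL_IMAGE_SIZE/PMD_SIZE) emits: one 2MiB large page each */
    for (uint64_t i = 0; i < 4; i++)
        printf("pmd[%llu] = 0x%016llx\n",
               (unsigned long long)i,
               (unsigned long long)((i << PMD_SHIFT) + PAGE_KERNEL_LARGE_EXEC));
    return 0;
}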
Entering startup_64 proper
Let's go through it piece by piece:
startup_64:
	/*
	 * Compute the delta between the address I am compiled to run at and the
	 * address I am actually running at.
	 */
	leaq	_text(%rip), %rbp
	subq	$_text - __START_KERNEL_map, %rbp

	/* Is the address not 2M aligned? */
	movq	%rbp, %rax
	andl	$~PMD_PAGE_MASK, %eax
	testl	%eax, %eax
	jnz	bad_address

	/*
	 * Is the address too large?
	 */
	leaq	_text(%rip), %rax
	shrq	$MAX_PHYSMEM_BITS, %rax
	jnz	bad_address
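The arithmetic is easier to see in C. Here is a minimal sketch with made-up addresses (compiled_phys and runtime_phys are illustrative assumptions, not real boot values):

#include <stdio.h>
#include <stdint.h>

#define PMD_PAGE_SIZE (1ULL << 21)              /* 2 MiB */
#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE - 1))

int main(void)
{
    /* hypothetical addresses, purely for illustration */
    uint64_t compiled_phys = 0x1000000;  /* $_text - __START_KERNEL_map */
    uint64_t runtime_phys  = 0x5000000;  /* what leaq _text(%rip) sees at boot */

    /* the relocation delta that ends up in %rbp */
    uint64_t delta = runtime_phys - compiled_phys;

    /* mirrors: andl $~PMD_PAGE_MASK, %eax ; jnz bad_address */
    if (delta & ~PMD_PAGE_MASK)
        printf("bad_address: delta 0x%llx is not 2MiB aligned\n",
               (unsigned long long)delta);
    else
        printf("delta = 0x%llx\n", (unsigned long long)delta);
    return 0;
}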
	/*
	 * Fixup the physical addresses in the page table
	 */
	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
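In C terms these four addq instructions are just index assignments into 512-entry tables. A sketch, with hypothetical arrays standing in for the real pages:

#include <stdint.h>

#define L4_START_KERNEL 511   /* pgd index of __START_KERNEL_map */

/* hypothetical stand-ins for the tables defined in head_64.S */
static uint64_t early_level4_pgt[512], level3_kernel_pgt[512],
                level2_fixmap_pgt[512];

static void fixup_page_table_phys(uint64_t delta)
{
    /* these entries embed link-time physical addresses, so each one
     * must be shifted by the relocation delta held in %rbp */
    early_level4_pgt[L4_START_KERNEL] += delta;
    level3_kernel_pgt[510] += delta;   /* -> level2_kernel_pgt */
    level3_kernel_pgt[511] += delta;   /* -> level2_fixmap_pgt */
    level2_fixmap_pgt[506] += delta;   /* -> level1_fixmap_pgt */
}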
	/*
	 * Set up the identity mapping for the switchover.  These
	 * entries should *NOT* have the global bit set!  This also
	 * creates a bunch of nonsense entries but that is fine --
	 * it avoids problems around wraparound.
	 */
	leaq	_text(%rip), %rdi
	leaq	early_level4_pgt(%rip), %rbx

	movq	%rdi, %rax
	shrq	$PGDIR_SHIFT, %rax

	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
	movq	%rdx, 0(%rbx,%rax,8)
	movq	%rdx, 8(%rbx,%rax,8)

	addq	$4096, %rdx
	movq	%rdi, %rax
	shrq	$PUD_SHIFT, %rax
	andl	$(PTRS_PER_PUD-1), %eax
	movq	%rdx, 4096(%rbx,%rax,8)
	incl	%eax
	andl	$(PTRS_PER_PUD-1), %eax
	movq	%rdx, 4096(%rbx,%rax,8)

	addq	$8192, %rbx
	movq	%rdi, %rax
	shrq	$PMD_SHIFT, %rdi
	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
	leaq	(_end - 1)(%rip), %rcx
	shrq	$PMD_SHIFT, %rcx
	subq	%rdi, %rcx
	incl	%ecx

1:
	andq	$(PTRS_PER_PMD - 1), %rdi
	movq	%rax, (%rbx,%rdi,8)
	incq	%rdi
	addq	$PMD_SIZE, %rax
	decl	%ecx
	jnz	1b
The two pages used here, at %rbx+4096 and %rbx+8192, sit immediately after early_level4_pgt; they are the beginning of early_dynamic_pgts. This is the "identity mapping for the switchover" the comment refers to: an identity mapping that is only needed during the brief page-table switch that follows. While the kernel transitions from running at low (identity-mapped) virtual addresses to its high virtual addresses, instruction fetches unavoidably still go through the low addresses, so a 1:1 mapping for them has to be prepared in advance.
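The same construction in C, as a rough user-space sketch (the table placement and flag values are assumptions; the real code works on the physical pages at %rbx+4096 and %rbx+8192):

#include <stdint.h>

#define PGDIR_SHIFT  39
#define PUD_SHIFT    30
#define PMD_SHIFT    21
#define PMD_SIZE     (1ULL << PMD_SHIFT)
#define PTRS         512

/* hypothetical stand-ins: the pgd page plus the two early_dynamic_pgts
 * pages used for the switchover (one PUD page, one PMD page) */
static uint64_t pgd[PTRS], pud[PTRS], pmd[PTRS];

#define KERNPG_TABLE  0x063ULL  /* assumed: present|rw|accessed|dirty */
#define LARGE_NOGLB   0x0e3ULL  /* assumed value of
                                   __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL */

static void ident_map(uint64_t text, uint64_t end)
{
    /* two consecutive pgd slots -> the pud page, like the pair of
     * movq %rdx,0(...) / movq %rdx,8(...) stores */
    uint64_t i = (text >> PGDIR_SHIFT) & (PTRS - 1);
    pgd[i]     = (uint64_t)(uintptr_t)pud + KERNPG_TABLE;
    pgd[i + 1] = (uint64_t)(uintptr_t)pud + KERNPG_TABLE;

    /* two consecutive pud slots (with wraparound) -> the pmd page */
    i = (text >> PUD_SHIFT) & (PTRS - 1);
    pud[i]                    = (uint64_t)(uintptr_t)pmd + KERNPG_TABLE;
    pud[(i + 1) & (PTRS - 1)] = (uint64_t)(uintptr_t)pmd + KERNPG_TABLE;

    /* one 2MiB entry per large page covering _text .. _end */
    uint64_t count = ((end - 1) >> PMD_SHIFT) - (text >> PMD_SHIFT) + 1;
    uint64_t slot  = text >> PMD_SHIFT;
    uint64_t entry = text + LARGE_NOGLB;   /* text is 2MiB aligned */
    while (count--) {
        pmd[slot & (PTRS - 1)] = entry;
        slot++;
        entry += PMD_SIZE;
    }
}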
At this point, the page tables look like this.
The final step in startup_64 is to fix up the page-table entries that map the physical pages the kernel image actually occupies. The code is as follows:
	/*
	 * Fixup the kernel text+data virtual addresses. Note that
	 * we might write invalid pmds, when the kernel is relocated
	 * cleanup_highmap() fixes this up along with the mappings
	 * beyond _end.
	 */
	leaq	level2_kernel_pgt(%rip), %rdi
	leaq	4096(%rdi), %r8
	/* See if it is a valid page table entry */
1:	testq	$1, 0(%rdi)
	jz	2f
	addq	%rbp, 0(%rdi)
	/* Go to the next page */
2:	addq	$8, %rdi
	cmp	%r8, %rdi
	jne	1b

	/* Fixup phys_base */
	addq	%rbp, phys_base(%rip)

	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
	jmp	1f
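In C this loop is simply: walk the 512 PMD entries of level2_kernel_pgt and relocate every one whose present bit is set. A minimal sketch, with a hypothetical array standing in for the real page:

#include <stdint.h>

static uint64_t level2_kernel_pgt[512];   /* hypothetical stand-in */

static void fixup_kernel_pmds(uint64_t delta)
{
    for (int i = 0; i < 512; i++)
        if (level2_kernel_pgt[i] & 1)     /* testq $1, 0(%rdi) */
            level2_kernel_pgt[i] += delta;
}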
The `jmp 1f` at the end of startup_64 lands in the code path shared with secondary_startup_64, where the new page tables are loaded into CR3; once there, the kernel GDT is switched:

	/*
	 * We must switch to a new descriptor in kernel space for the GDT
	 * because soon the kernel won't have access anymore to the userspace
	 * addresses where we're currently running on. We have to do that here
	 * because in 32bit we couldn't load a 64bit linear address.
	 */
	lgdt	early_gdt_descr(%rip)
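For reference, the 10-byte operand that lgdt reads in long mode is a 16-bit limit followed by a 64-bit linear base. A sketch of early_gdt_descr's shape (field names are mine):

#include <stdint.h>

struct __attribute__((packed)) gdt_descr {
    uint16_t limit;   /* size of the GDT in bytes, minus one */
    uint64_t base;    /* linear address the GDT lives at */
};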
Next, initialize the segment registers:
	/* set up data segments */
	xorl	%eax, %eax
	movl	%eax, %ds
	movl	%eax, %ss
	movl	%eax, %es
	movl	%eax, %fs
	movl	%eax, %gs

	/* Set up %gs.
	 *
	 * The base of %gs always points to the bottom of the irqstack
	 * union.  If the stack protector canary is enabled, it is
	 * located at %gs:40.  Note that, on SMP, the boot cpu uses
	 * init data section till per cpu areas are set up.
	 */
	movl	$MSR_GS_BASE, %ecx
	movl	initial_gs(%rip), %eax
	movl	initial_gs+4(%rip), %edx
	wrmsr
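The wrmsr instruction takes the MSR index in %ecx and the 64-bit value split across %edx:%eax, which is why the assembly performs two 32-bit movl loads. A small sketch of that split (the initial_gs value here is a hypothetical per-cpu base):

#include <stdio.h>
#include <stdint.h>

#define MSR_GS_BASE 0xc0000101u   /* architectural MSR number */

int main(void)
{
    uint64_t initial_gs = 0xffffffff81a00000ULL;  /* hypothetical value */

    uint32_t eax = (uint32_t)initial_gs;          /* movl initial_gs(%rip),%eax */
    uint32_t edx = (uint32_t)(initial_gs >> 32);  /* movl initial_gs+4(%rip),%edx */

    printf("wrmsr(0x%x): edx=0x%08x eax=0x%08x\n", MSR_GS_BASE, edx, eax);
    return 0;
}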
	/* Finally jump to run C code and to be on real kernel address
	 * Since we are running on identity-mapped space we have to jump
	 * to the full 64bit address, this is only possible as indirect
	 * jump.  In addition we need to ensure %cs is set so we make this
	 * a far return.
	 */
	movq	initial_code(%rip), %rax
	pushq	$0		# fake return address to stop unwinder
	pushq	$__KERNEL_CS	# set correct cs
	pushq	%rax		# target address in negative space
	lretq
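The push order matters: lretq pops %rip first and then %cs, so the last value pushed (%rax, holding initial_code) becomes the new %rip. A sketch of the resulting stack frame, lowest address first (field names are mine):

#include <stdint.h>

struct __attribute__((packed)) switch_frame {
    uint64_t rip;      /* pushq %rax          -- initial_code */
    uint64_t cs;       /* pushq $__KERNEL_CS  -- popped into %cs */
    uint64_t fake_ret; /* pushq $0            -- stops stack unwinders */
};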