Linux版本:v6.0
處理器架構:ARMv8
前言
在理解 Linux kernel 開機流程時,最難理解的應該就是記憶體映射的部份了。我認為困難的點在於處理記憶體映射的程式本身也在記憶體的映射之中,所以除了看懂程式邏輯之外,還必須理解該部份程式是在怎樣的映射環境中執行,以及它的操作會對記憶體映射造成什麼樣的影響等等。再加上變數中存的位址有些是虛擬位址、有些是實體位址,更把事情給複雜化。
這篇介紹 __create_pgd_mapping,它負責在開機階段建立各級頁表。本文主要著重在實作的分析,使用場景則先略過。
__create_pgd_mapping 執行環境
在說明函式原型及實作之前,先說明一下執行環境。__create_pgd_mapping 是開機流程所使用的工具函式,執行時的記憶體映射狀況為:MMU 已開啟,使用的 pgd(大部分情況下)為 init_pg_dir,linear mapping 有可能尚未建立所以無法使用,fixmap 除了 level 3 以外的頁表已經建立好,kernel 跑在高虛擬位址。
__create_pgd_mapping 函式原型
以下是 __create_pgd_mapping 的函式原型:
1 2 3 4 5
/*
 * __create_pgd_mapping - map the virtual range starting at @virt, @size bytes
 * long, onto physical memory starting at @phys, in the page table rooted at
 * @pgdir.
 *
 * @pgdir:         top-level (PGD) table to populate
 * @phys:          physical start address of the region
 * @virt:          virtual start address of the region
 * @size:          length of the region
 * @prot:          protection attributes used for the created entries
 * @pgtable_alloc: callback returning the physical address of a new table;
 *                 it is passed the shift of the level being allocated
 * @flags:         mapping options; NO_EXEC_MAPPINGS, NO_BLOCK_MAPPINGS and
 *                 NO_CONT_MAPPINGS are the bits tested by the helpers
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				 unsigned long virt, phys_addr_t size,
				 pgprot_t prot,
				 phys_addr_t (*pgtable_alloc)(int ),
				 int flags);
__create_pgd_mapping
實作1 2 3 4 5 6 7 8 9 10 11
/*
 * Locked entry point for building a mapping.
 *
 * Takes fixmap_lock, forwards every argument unchanged to
 * __create_pgd_mapping_locked(), and drops the lock again.
 *
 * NOTE(review): the lock presumably serialises use of the shared fixmap
 * slots that the lower-level helpers map page tables through
 * (pud/pmd/pte_set_fixmap_offset) -- confirm against the lock's definition.
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				 unsigned long virt, phys_addr_t size,
				 pgprot_t prot,
				 phys_addr_t (*pgtable_alloc)(int ),
				 int flags)
{
	mutex_lock(&fixmap_lock);
	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
				    pgtable_alloc, flags);
	mutex_unlock(&fixmap_lock);
}
沒什麼好講的:先取得 fixmap_lock(後面各級函式會透過 fixmap 來存取頁表),然後呼叫內部函式 __create_pgd_mapping_locked,最後再釋放 lock。
__create_pgd_mapping_locked
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
/*
 * Walk the PGD entries covering the requested range and let
 * alloc_init_pud() fill in each one.
 *
 * @pgdir:         top-level table to populate
 * @phys, @virt:   start of the physical / virtual range
 * @size:          length of the range
 * @prot:          protection attributes, passed down unchanged
 * @pgtable_alloc: table allocator callback, passed down unchanged
 * @flags:         mapping option bits, passed down unchanged
 *
 * Returns without mapping anything (after a WARN) when @phys and @virt do
 * not share the same low bits.
 */
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
					unsigned long virt, phys_addr_t size,
					pgprot_t prot,
					phys_addr_t (*pgtable_alloc)(int ),
					int flags)
{
	unsigned long addr, end, next;
	/* PGD entry that covers the first virtual address */
	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);

	/*
	 * phys and virt must agree in the bits outside PAGE_MASK (the offset
	 * within a page); otherwise the region cannot be mapped page by page.
	 */
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
		return ;

	/* Round the start down and the end up to whole pages. */
	phys &= PAGE_MASK;
	addr = virt & PAGE_MASK;
	end = PAGE_ALIGN(virt + size);

	do {
		/*
		 * End of the chunk handled by this PGD entry (presumably the
		 * next PGD boundary, capped at end).
		 */
		next = pgd_addr_end(addr, end);
		alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
			       flags);
		/* Keep phys in step with the virtual address. */
		phys += next - addr;
	} while (pgdp++, addr = next, addr != end);
}
alloc_init_pud
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
/*
 * Populate the PUD level for [addr, end) below the given PGD entry.
 *
 * If the P4D entry is empty, a PUD table is obtained from @pgtable_alloc and
 * installed first. Each PUD entry in the range is then either written as a
 * block mapping or handed to alloc_init_cont_pmd() for finer mapping.
 *
 * @pgdp:          PGD entry covering @addr
 * @addr, @end:    virtual range to map
 * @phys:          physical address corresponding to @addr
 * @prot:          protection attributes for the mapping
 * @pgtable_alloc: table allocator callback; must be non-NULL if a table
 *                 has to be allocated (enforced with BUG_ON)
 * @flags:         NO_EXEC_MAPPINGS / NO_BLOCK_MAPPINGS are tested here,
 *                 the value is also passed down
 */
static void alloc_init_pud (pgd_t *pgdp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(int ),
			   int flags)
{
	unsigned long next;
	pud_t *pudp;
	p4d_t *p4dp = p4d_offset(pgdp, addr);
	/* Snapshot of the entry; re-read below after it is populated. */
	p4d_t p4d = READ_ONCE(*p4dp);

	if (p4d_none(p4d)) {
		/* No PUD table yet: allocate one and link it as a table entry. */
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
		phys_addr_t pud_phys;

		if (flags & NO_EXEC_MAPPINGS)
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(PUD_SHIFT);
		__p4d_populate(p4dp, pud_phys, p4dval);
		p4d = READ_ONCE(*p4dp);
	}
	BUG_ON(p4d_bad(p4d));

	/*
	 * Obtain a pointer to the PUD entry for addr; paired with
	 * pud_clear_fixmap() at the end of the function. Presumably the table
	 * is reached through a temporary fixmap mapping -- confirm.
	 */
	pudp = pud_set_fixmap_offset(p4dp, addr);
	do {
		/* Remember the previous value for the sanity checks below. */
		pud_t old_pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);

		/*
		 * Use a PUD-level block mapping only when it is supported,
		 * addr/next/phys are all PUD-aligned, and the caller has not
		 * forbidden block mappings.
		 */
		if (pud_sect_supported() &&
		    ((addr | next | phys) & ~PUD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0 ) {
			pud_set_huge(pudp, phys, prot);

			/* The old -> new transition must pass the safety check. */
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
						      READ_ONCE(pud_val(*pudp))));
		} else {
			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			/* An entry that was already set must not have changed. */
			BUG_ON(pud_val(old_pud) != 0 &&
			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
		}
		phys += next - addr;
	} while (pudp++, addr = next, addr != end);

	pud_clear_fixmap();
}
看到這邊要來解釋一下函式邏輯,目前看到的call chain長這樣:
__create_pgd_mapping
  --> __create_pgd_mapping_locked
    --> alloc_init_pud
      // 以下函式會在下文說明
      --> alloc_init_cont_pmd
        --> init_pmd
          --> alloc_init_cont_pte
            --> init_pte
從上面的程式碼可以看出:
- __create_pgd_mapping_locked 負責填充 pgd 頁
- alloc_init_pud 負責填充 pud 頁
那為什麼 pmd 頁和 pte 頁各需要兩個函式來處理呢?原因是 ARMv8 架構中頁表項有個 contiguous bit:簡單來說,如果一段連續的虛擬位址經過頁表轉換後會對應到另一段連續的實體位址,軟體可以設置這個 bit,提示硬體這些相鄰的頁表項可以共用同一個 TLB entry,藉此優化 TLB 的表現。alloc_init_cont_pmd 和 alloc_init_cont_pte 會在下一級頁表不存在時先配置頁表,並盡量把傳入的位址範圍使用 contiguous bit 來建立 mapping;而 init_pmd 和 init_pte 則負責實際填充 pmd 頁和 pte 頁。
alloc_init_cont_pmd
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
/*
 * Make sure a PMD table exists below the given PUD entry, then map
 * [addr, end) in contiguous-range sized chunks via init_pmd().
 *
 * A chunk gets PTE_CONT added to its protection bits only when addr, next
 * and phys are all aligned to CONT_PMD_MASK and NO_CONT_MAPPINGS is clear.
 *
 * @pudp:          PUD entry covering the range (must not be a block mapping)
 * @addr, @end:    virtual range to map
 * @phys:          physical address corresponding to @addr
 * @prot:          base protection attributes
 * @pgtable_alloc: table allocator callback; must be non-NULL if a table
 *                 has to be allocated (enforced with BUG_ON)
 * @flags:         NO_EXEC_MAPPINGS / NO_CONT_MAPPINGS are tested here,
 *                 the value is also passed down
 */
static void alloc_init_cont_pmd (pud_t *pudp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int ), int flags)
{
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);

	/* A table is needed here, so an existing block entry is fatal. */
	BUG_ON(pud_sect(pud));
	if (pud_none(pud)) {
		/* No PMD table yet: allocate one and link it as a table entry. */
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
		phys_addr_t pmd_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(PMD_SHIFT);
		__pud_populate(pudp, pmd_phys, pudval);
		pud = READ_ONCE(*pudp);
	}
	BUG_ON(pud_bad(pud));

	do {
		/* Start from the caller's attributes for every chunk. */
		pgprot_t __prot = prot;

		next = pmd_cont_addr_end(addr, end);

		/* Mark the chunk contiguous only if fully aligned and allowed. */
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0 ) &&
		    (flags & NO_CONT_MAPPINGS) == 0 )
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);

		phys += next - addr;
		/* pudp is not advanced: the whole range sits under one PUD entry. */
	} while (addr = next, addr != end);
}
init_pmd
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
/*
 * Fill the PMD entries covering [addr, end) below the given PUD entry.
 *
 * Each entry is either written as a PMD-level block mapping or handed to
 * alloc_init_cont_pte() so the range is mapped with page-table entries.
 *
 * @pudp:          PUD entry whose PMD table is being filled
 * @addr, @end:    virtual range to map
 * @phys:          physical address corresponding to @addr
 * @prot:          protection attributes (may already carry PTE_CONT)
 * @pgtable_alloc: table allocator callback, passed down
 * @flags:         NO_BLOCK_MAPPINGS is tested here, the value is also
 *                 passed down
 */
static void init_pmd (pud_t *pudp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot,
		     phys_addr_t (*pgtable_alloc)(int ), int flags)
{
	unsigned long next;
	pmd_t *pmdp;

	/*
	 * Pointer to the PMD entry for addr; paired with pmd_clear_fixmap()
	 * below. Presumably reached through a temporary fixmap mapping --
	 * confirm.
	 */
	pmdp = pmd_set_fixmap_offset(pudp, addr);
	do {
		/* Remember the previous value for the sanity checks below. */
		pmd_t old_pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);

		/*
		 * Use a block mapping when addr/next/phys are all PMD-aligned
		 * and block mappings are not forbidden.
		 */
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0 ) {
			pmd_set_huge(pmdp, phys, prot);

			/* The old -> new transition must pass the safety check. */
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
						      READ_ONCE(pmd_val(*pmdp))));
		} else {
			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			/* An entry that was already set must not have changed. */
			BUG_ON(pmd_val(old_pmd) != 0 &&
			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);

	pmd_clear_fixmap();
}
接下來 alloc_init_cont_pte 和 init_pte 的操作邏輯跟 pmd 的版本基本一樣,可以自己研究看看:
alloc_init_cont_pte
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
/*
 * Make sure a PTE table exists below the given PMD entry, then map
 * [addr, end) in contiguous-range sized chunks via init_pte().
 *
 * A chunk gets PTE_CONT added to its protection bits only when addr, next
 * and phys are all aligned to CONT_PTE_MASK and NO_CONT_MAPPINGS is clear.
 *
 * @pmdp:          PMD entry covering the range (must not be a block mapping)
 * @addr, @end:    virtual range to map
 * @phys:          physical address corresponding to @addr
 * @prot:          base protection attributes
 * @pgtable_alloc: table allocator callback; must be non-NULL if a table
 *                 has to be allocated (enforced with BUG_ON)
 * @flags:         NO_EXEC_MAPPINGS / NO_CONT_MAPPINGS are tested here
 */
static void alloc_init_cont_pte (pmd_t *pmdp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(int ),
				int flags)
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);

	/* A table is needed here, so an existing block entry is fatal. */
	BUG_ON(pmd_sect(pmd));
	if (pmd_none(pmd)) {
		/* No PTE table yet: allocate one and link it as a table entry. */
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
		phys_addr_t pte_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(PAGE_SHIFT);
		__pmd_populate(pmdp, pte_phys, pmdval);
		pmd = READ_ONCE(*pmdp);
	}
	BUG_ON(pmd_bad(pmd));

	do {
		/* Start from the caller's attributes for every chunk. */
		pgprot_t __prot = prot;

		next = pte_cont_addr_end(addr, end);

		/* Mark the chunk contiguous only if fully aligned and allowed. */
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0 ) &&
		    (flags & NO_CONT_MAPPINGS) == 0 )
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(pmdp, addr, next, phys, __prot);

		phys += next - addr;
		/* pmdp is not advanced: the whole range sits under one PMD entry. */
	} while (addr = next, addr != end);
}
init_pte
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
/*
 * Write one PTE per page for [addr, end) below the given PMD entry.
 *
 * @pmdp:       PMD entry whose PTE table is being filled
 * @addr, @end: virtual range to map; stepped in PAGE_SIZE units
 * @phys:       physical address corresponding to @addr
 * @prot:       protection attributes for every entry written
 */
static void init_pte (pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	pte_t *ptep;

	/*
	 * Pointer to the PTE for addr; paired with pte_clear_fixmap() below.
	 * Presumably reached through a temporary fixmap mapping -- confirm.
	 */
	ptep = pte_set_fixmap_offset(pmdp, addr);
	do {
		/* Remember the previous value for the sanity check below. */
		pte_t old_pte = READ_ONCE(*ptep);

		/* Entry built from the page frame number of phys plus prot. */
		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/* The old -> new transition must pass the safety check. */
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
					      READ_ONCE(pte_val(*ptep))));

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	pte_clear_fixmap();
}