Linux版本:v6.0
處理器架構:ARMv8
前言 在Linux kernel 5.10週期,KVM ARM的開發者們為了替Google pKVM做準備,在code base的許多地方做了翻修,本文要介紹的就是其中新設計的page table walker。
原先在KVM ARM中在做page table walk的時候,寫法就是單純的在需要的地方直接一路access然後dereference下去,e.g. create_hyp_{p4d, pud, pmd, pte}_mappings
這幾個函式。這樣做的缺點之一就是軟體在存取page tables時code很難重複使用,而新的作法在5.10中出現,把存取page table這樣的操作模組化,各個需要存取page table的地方都能共用同樣的程式。
重要結構 使用新的page table walker時,需要提供一些資訊:
所要存取的page table (struct kvm_pgtable
)
想要對page table進行的操作,以及walk到哪裡時進行 (struct kvm_pgtable_walker
)
訪問哪個虛擬位址範圍 (struct kvm_pgtable_walk_data
)
見以下說明:
kvm_pgtable
紀錄一整個page table tree的metadata,以下簡單說明幾個重要的成員
1 2 3 4 5 6 7 8 9 10 11 struct kvm_pgtable { u32 ia_bits; u32 start_level; kvm_pte_t *pgd; struct kvm_pgtable_mm_ops *mm_ops ; struct kvm_s2_mmu *mmu ; enum kvm_pgtable_stage2_flags flags ; kvm_pgtable_force_pte_cb_t force_pte_cb; };
kvm_pgtable_walker
提供使用者設定訪問page table時呼叫的函數以及何時呼叫,cb
代表”call back”
1 2 3 4 5 6 7 8 struct kvm_pgtable_walker { const kvm_pgtable_visitor_fn_t cb; void * const arg; const enum kvm_pgtable_walk_flags flags ; };
kvm_pgtable_walk_data
想要訪問的位址區間,可以看出kvm_pgtable_walk_data
實際上包含了前兩者的資訊
1 2 3 4 5 6 7 struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt ; struct kvm_pgtable_walker *walker ; u64 addr; u64 end; };
新walker實作 接著就可以來看walker的入口點kvm_pgtable_walk
:
kvm_pgtable_walk
1 2 3 4 5 6 7 8 9 10 11 12 int kvm_pgtable_walk (struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { .pgt = pgt, .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; return _kvm_pgtable_walk(&walk_data); }
這個函式預期pgt
和walker
caller已經準備好了,再加上利用傳進來的addr
和size
製作出這次所使用的kvm_pgtable_walk_data
,然後呼叫_kvm_pgtable_walk
。
_kvm_pgtable_walk
這個函式負責檢查位址範圍,並依序走訪範圍內的各個root page(對每個root page呼叫__kvm_pgtable_walk),見註解
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data){ u32 idx; int ret = 0 ; struct kvm_pgtable *pgt = data->pgt; u64 limit = BIT(pgt->ia_bits); if (data->addr > limit || data->end > limit) return -ERANGE; if (!pgt->pgd) return -EINVAL; for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); if (ret) break ; } return ret; }
__kvm_pgtable_walk
這個函式loop過一個page的entries
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, > kvm_pte_t *pgtable, u32 level) { u32 idx; int ret = 0 ; if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { kvm_pte_t *ptep = &pgtable[idx]; if (data->addr >= data->end) break ; ret = __kvm_pgtable_visit(data, ptep, level); if (ret) break ; } return ret; }
__kvm_pgtable_visit
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, kvm_pte_t *ptep, u32 level) { int ret = 0 ; u64 addr = data->addr; kvm_pte_t *childp, pte = *ptep; bool table = kvm_pte_table(pte, level); enum kvm_pgtable_walk_flags flags = data->walker->flags; if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, KVM_PGTABLE_WALK_TABLE_PRE); } if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, KVM_PGTABLE_WALK_LEAF); pte = *ptep; table = kvm_pte_table(pte, level); } if (ret) goto out; if (!table) { data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } childp = kvm_pte_follow(pte, data->pgt->mm_ops); ret = __kvm_pgtable_walk(data, childp, level + 1 ); if (ret) goto out; if (flags & KVM_PGTABLE_WALK_TABLE_POST) { ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, KVM_PGTABLE_WALK_TABLE_POST); } out: return ret; }
Page table其實是一個多child的樹狀結構(4K page就是512個child),這個page table walker使用了遞迴的方式來對其進行操作,還支持pre-order, post-order的邏輯,level
代表第幾層,__kvm_pgtable_visit
負責:
在適當的條件對table進行操作(呼叫callbacks)
推進data->addr
的進度
遇到table
的時候遞迴呼叫__kvm_pgtable_walk
而__kvm_pgtable_walk
負責一個page table裡面的entry的loop。
使用範例:create_hyp_mappings
Linux在初始化KVM的時候的執行模式是EL1,此時需要在進入EL2之前為其製作和設定好EL2所使用的page tables,使用的函式就是create_hyp_mappings
。
create_hyp_mappings
在KVM初始化重點函式之一init_hyp_mode
中被多次呼叫,分別替EL2 建立了以下幾個區域的mappings:
EL2 code (__hyp_text_start
~ __hyp_text_end
)
EL2 read only data (__hyp_rodata_start
~ __hyp_rodata_end
)
EL1 read only data (__start_rodata
~ __end_rodata
)
EL2 BSS (__hyp_bss_start
~ __hyp_bss_end
)
EL1 BSS (__hyp_bss_end
~ __bss_stop
)
EL2 stack
EL2 percpu area
這個函式只是建立page tables,並不會進到EL2啟動EL2的address translation機制
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 int create_hyp_mappings (void *from, void *to, enum kvm_pgtable_prot prot) { phys_addr_t phys_addr; unsigned long virt_addr; unsigned long start = kern_hyp_va((unsigned long )from); unsigned long end = kern_hyp_va((unsigned long )to); if (is_kernel_in_hyp_mode()) return 0 ; if (!kvm_host_owns_hyp_mappings()) return -EPERM; start = start & PAGE_MASK; end = PAGE_ALIGN(end); for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, prot); if (err) return err; } return 0 ; }
註解其實就對這個函式有不少說明,作為輸入的from
, to
是想要map給EL2的EL1虛擬位址區間,prot
則是讀寫執行等權限設定。有趣的是並不需要指定要map給EL2的虛擬位址,EL2的虛擬位址規劃KVM自身有機制決定,具體來說就是利用kern_hyp_va
把EL1的虛擬位址轉換成EL2的虛擬位址。
__create_hyp_mappings
鎖上kvm_hyp_pgd_mutex
然後呼叫kvm_pgtable_hyp_map
,注意hyp_pgtable
即為page table walk所需的kvm_pgtable
1 2 3 4 5 6 7 8 9 10 11 12 13 14 int __create_hyp_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { int err; if (WARN_ON(!kvm_host_owns_hyp_mappings())) return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); return err; }
kvm_pgtable_hyp_map
這個函式就會呼叫新的page table walker kvm_pgtable_walk
了,呼叫之前製作所需的kvm_pgtable_walker
(1),和傳入的hyp_pgtable
(參數pgt
)一起傳給kvm_pgtable_walk
(2)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 int kvm_pgtable_hyp_map (struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot) { int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; ret = hyp_set_prot_attr(prot, &map_data.attr); if (ret) return ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); isb(); return ret; }
可想而知,作為cb
,只在碰到葉節點(flags: KVM_PGTABLE_WALK_LEAF
)會被呼叫的hyp_map_walker
就會負責申請各級的page tables並安裝進適當的page table entries。