kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU
[linux.git] arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

static bool __read_mostly tdp_mmu_enabled = false;

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
        return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
        return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!is_tdp_mmu_enabled())
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)                          \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
        struct kvm_mmu_page *sp;

        sp = to_shadow_page(hpa);

        return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

        lockdep_assert_held(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn, false);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}
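
/*
 * Note on the max_gfn computation above (also used by kvm_tdp_mmu_zap_all()
 * below): boot_cpu_data.x86_phys_bits is the CPU's physical address width,
 * so 1ULL << (x86_phys_bits - PAGE_SHIFT) is the number of 4 KiB frames the
 * host can address. For example, with a 46-bit MAXPHYADDR and PAGE_SHIFT of
 * 12, max_gfn = 1ULL << 34, so the zap walks the entire GFN space [0, 2^34).
 */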

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
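        /*
         * Stash the kvm_mmu_page pointer in the struct page backing the new
         * page table, so sptep_to_sp()/to_shadow_page() can recover it from
         * a raw SPTE pointer later.
         */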
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        spin_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        spin_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        spin_unlock(&kvm->mmu_lock);

        return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
        u64 *pt;
        struct kvm_mmu_page *sp;
        u64 old_child_spte;
        int i;

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        /*
         * The only time a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve an MMIO SPTE, it is
                 * unexpected. Log the change, though it should not impact the
                 * guest since both the former and current SPTEs are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present)) {
                pt = spte_to_child_pt(old_spte, level);
                sp = sptep_to_sp(pt);

                list_del(&sp->link);

                for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                        old_child_spte = READ_ONCE(*(pt + i));
                        WRITE_ONCE(*(pt + i), 0);
                        handle_changed_spte(kvm, as_id,
                                gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
                                old_child_spte, 0, level - 1);
                }

                kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                                   KVM_PAGES_PER_HPAGE(level));

                free_page((unsigned long)pt);
                kmem_cache_free(mmu_page_header_cache, sp);
        }
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        *iter->sptep = new_spte;

        handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                            iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)
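
/*
 * Illustrative sketch, not part of the original file: one way the iterator
 * macros above could be used, e.g. to count the present leaf SPTEs a root
 * maps in [start, end). The helper name is hypothetical and the caller is
 * assumed to hold kvm->mmu_lock.
 */
static int __maybe_unused count_present_leaf_sptes(struct kvm_mmu_page *root,
                                                   gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        int count = 0;

        /* Visit every SPTE in the root's paging structure within the range. */
        tdp_root_for_each_pte(iter, root, start, end) {
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_last_spte(iter.old_spte, iter.level))
                        count++;
        }

        return count;
}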

/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the TLB.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
                return false;
        } else {
                return true;
        }
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                if (can_yield)
                        flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
                else
                        flush_needed = true;
        }
        return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                flush |= zap_gfn_range(kvm, root, start, end, true);

                kvm_mmu_put_root(kvm, root);
        }

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                          int map_writable,
                                          struct tdp_iter *iter,
                                          kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn))) {
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
                trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
        } else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                         pfn, iter->old_spte, prefault, true,
                                         map_writable, !shadow_accessed_mask,
                                         &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else
                tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte)))
                ret = RET_PF_EMULATE;

        trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
                                        KVM_PAGES_PER_HPAGE(iter.level));

                        /*
                         * The iter must explicitly re-read the SPTE here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*iter.sptep);
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
                        clear_page(child_pt);
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        trace_kvm_mmu_get_page(sp, true);
                        tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
                }
        }

        if (WARN_ON(iter.level != level))
                return RET_PF_RETRY;

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);

        return ret;
}
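
/*
 * Illustrative sketch, not part of the original file: kvm_tdp_mmu_map() is
 * intended to be called from the common x86 page fault path once the pfn
 * backing the faulting GPA has been resolved. The helper below is
 * hypothetical and only shows the expected calling convention; real callers
 * derive map_writable, max_level and prefault from the fault and memslot.
 */
static int __maybe_unused tdp_mmu_map_example(struct kvm_vcpu *vcpu, gpa_t gpa,
                                              u32 error_code, kvm_pfn_t pfn)
{
        int r;

        /* The TDP MMU expects the fault to be handled under the MMU lock. */
        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_tdp_mmu_map(vcpu, gpa, error_code, /*map_writable=*/true,
                            /*max_level=*/PG_LEVEL_4K, pfn,
                            /*prefault=*/false);
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;
}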

static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
                unsigned long end, unsigned long data,
                int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
                               struct kvm_mmu_page *root, gfn_t start,
                               gfn_t end, unsigned long data))
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
        int ret = 0;
        int as_id;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
                        unsigned long hva_start, hva_end;
                        gfn_t gfn_start, gfn_end;

                        hva_start = max(start, memslot->userspace_addr);
                        hva_end = min(end, memslot->userspace_addr +
                                      (memslot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;
                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

                        ret |= handler(kvm, memslot, root, gfn_start,
                                       gfn_end, data);
                }

                kvm_mmu_put_root(kvm, root);
        }

        return ret;
}
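
/*
 * Worked example of the clamping above (illustrative numbers only): for a
 * memslot with userspace_addr 0x7f0000000000, npages 512 and base_gfn
 * 0x100000, a notifier range of [0x7f0000001000, 0x7f0000003000) gives
 * hva_start = 0x7f0000001000 and hva_end = 0x7f0000003000. The host pages
 * intersecting that range back guest frames 0x100001 and 0x100002, so
 * gfn_start = 0x100001, gfn_end = 0x100003, and the handler runs on the
 * half-open range [0x100001, 0x100003).
 */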

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
{
        return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            zap_gfn_range_hva_wrapper);
}
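
/*
 * Illustrative sketch, not part of the original file: the invalidate range
 * MMU notifier path in mmu.c is expected to invoke the zap above alongside
 * the legacy MMU handler, roughly along the lines below. The exact caller
 * shown here is an assumption for the example, not the upstream code.
 */
#if 0   /* example only, not compiled */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start,
                        unsigned long end, unsigned flags)
{
        int r;

        r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);

        if (kvm->arch.tdp_mmu_enabled)
                r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);

        return r;
}
#endif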