kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU
[linux.git] arch/x86/kvm/mmu/tdp_mmu.c
// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

static bool __read_mostly tdp_mmu_enabled = false;

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
        return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
        return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!is_tdp_mmu_enabled())
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)                          \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
        struct kvm_mmu_page *sp;

        sp = to_shadow_page(hpa);

        return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

        lockdep_assert_held(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn, false);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}
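
/*
 * Note on the max_gfn computation above (also used by kvm_tdp_mmu_zap_all()
 * below): boot_cpu_data.x86_phys_bits is the CPU's physical address width,
 * so 1ULL << (x86_phys_bits - PAGE_SHIFT) is the number of 4 KiB frames the
 * host can address. For example, with a 46-bit MAXPHYADDR and PAGE_SHIFT of
 * 12, max_gfn = 1ULL << 34, so the zap walks the entire GFN space [0, 2^34).
 */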

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
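        /*
         * Stash the kvm_mmu_page pointer in the struct page backing the new
         * page table, so sptep_to_sp()/to_shadow_page() can recover it from
         * a raw SPTE pointer later.
         */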
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        return sp;
}

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        spin_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        spin_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        spin_unlock(&kvm->mmu_lock);

        return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
        u64 *pt;
        struct kvm_mmu_page *sp;
        u64 old_child_spte;
        int i;

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        /*
         * The only time a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve an MMIO SPTE, it is
                 * unexpected. Log the change, though it should not impact the
                 * guest since both the former and current SPTEs are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present)) {
                pt = spte_to_child_pt(old_spte, level);
                sp = sptep_to_sp(pt);

                list_del(&sp->link);

                for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                        old_child_spte = READ_ONCE(*(pt + i));
                        WRITE_ONCE(*(pt + i), 0);
                        handle_changed_spte(kvm, as_id,
                                gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
                                old_child_spte, 0, level - 1);
                }

                kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                                   KVM_PAGES_PER_HPAGE(level));

                free_page((unsigned long)pt);
                kmem_cache_free(mmu_page_header_cache, sp);
        }
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        *iter->sptep = new_spte;

        handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                            iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)
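
/*
 * Illustrative sketch, not part of the original file: one way the iterator
 * macros above could be used, e.g. to count the present leaf SPTEs a root
 * maps in [start, end). The helper name is hypothetical and the caller is
 * assumed to hold kvm->mmu_lock.
 */
static int __maybe_unused count_present_leaf_sptes(struct kvm_mmu_page *root,
                                                   gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        int count = 0;

        /* Visit every SPTE in the root's paging structure within the range. */
        tdp_root_for_each_pte(iter, root, start, end) {
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_last_spte(iter.old_spte, iter.level))
                        count++;
        }

        return count;
}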

/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the TLB.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
                return false;
        } else {
                return true;
        }
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                if (can_yield)
                        flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
                else
                        flush_needed = true;
        }
        return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                flush |= zap_gfn_range(kvm, root, start, end, true);

                kvm_mmu_put_root(kvm, root);
        }

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                          int map_writable,
                                          struct tdp_iter *iter,
                                          kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn))) {
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
                trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
        } else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                         pfn, iter->old_spte, prefault, true,
                                         map_writable, !shadow_accessed_mask,
                                         &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else
                tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If an MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte)))
                ret = RET_PF_EMULATE;

        trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
                                        KVM_PAGES_PER_HPAGE(iter.level));

                        /*
                         * The iter must explicitly re-read the SPTE here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*iter.sptep);
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
                        clear_page(child_pt);
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        trace_kvm_mmu_get_page(sp, true);
                        tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
                }
        }

        if (WARN_ON(iter.level != level))
                return RET_PF_RETRY;

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);

        return ret;
}
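
/*
 * Illustrative sketch, not part of the original file: kvm_tdp_mmu_map() is
 * intended to be called from the common x86 page fault path once the pfn
 * backing the faulting GPA has been resolved. The helper below is
 * hypothetical and only shows the expected calling convention; real callers
 * derive map_writable, max_level and prefault from the fault and memslot.
 */
static int __maybe_unused tdp_mmu_map_example(struct kvm_vcpu *vcpu, gpa_t gpa,
                                              u32 error_code, kvm_pfn_t pfn)
{
        int r;

        /* The TDP MMU expects the fault to be handled under the MMU lock. */
        spin_lock(&vcpu->kvm->mmu_lock);
        r = kvm_tdp_mmu_map(vcpu, gpa, error_code, /*map_writable=*/true,
                            /*max_level=*/PG_LEVEL_4K, pfn,
                            /*prefault=*/false);
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;
}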

static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
                unsigned long end, unsigned long data,
                int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
                               struct kvm_mmu_page *root, gfn_t start,
                               gfn_t end, unsigned long data))
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
        int ret = 0;
        int as_id;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
                        unsigned long hva_start, hva_end;
                        gfn_t gfn_start, gfn_end;

                        hva_start = max(start, memslot->userspace_addr);
                        hva_end = min(end, memslot->userspace_addr +
                                      (memslot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;
                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

                        ret |= handler(kvm, memslot, root, gfn_start,
                                       gfn_end, data);
                }

                kvm_mmu_put_root(kvm, root);
        }

        return ret;
}
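
/*
 * Worked example of the clamping above (illustrative numbers only): for a
 * memslot with userspace_addr 0x7f0000000000, npages 512 and base_gfn
 * 0x100000, a notifier range of [0x7f0000001000, 0x7f0000003000) gives
 * hva_start = 0x7f0000001000 and hva_end = 0x7f0000003000. The host pages
 * intersecting that range back guest frames 0x100001 and 0x100002, so
 * gfn_start = 0x100001, gfn_end = 0x100003, and the handler runs on the
 * half-open range [0x100001, 0x100003).
 */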

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
{
        return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            zap_gfn_range_hva_wrapper);
}
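
/*
 * Illustrative sketch, not part of the original file: the invalidate range
 * MMU notifier path in mmu.c is expected to invoke the zap above alongside
 * the legacy MMU handler, roughly along the lines below. The exact caller
 * shown here is an assumption for the example, not the upstream code.
 */
#if 0   /* example only, not compiled */
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start,
                        unsigned long end, unsigned flags)
{
        int r;

        r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);

        if (kvm->arch.tdp_mmu_enabled)
                r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);

        return r;
}
#endif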