sys/platform/vkernel/platform/pmap.c

   1 /*
   2  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
   3  * Copyright (c) 1991 Regents of the University of California.
   4  * All rights reserved.
   5  * Copyright (c) 1994 John S. Dyson
   6  * All rights reserved.
   7  * Copyright (c) 1994 David Greenman
   8  * All rights reserved.
   9  * Copyright (c) 2004-2006 Matthew Dillon
  10  * All rights reserved.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  *
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in
  20  *    the documentation and/or other materials provided with the
  21  *    distribution.
  22  * 3. Neither the name of The DragonFly Project nor the names of its
  23  *    contributors may be used to endorse or promote products derived
  24  *    from this software without specific, prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  28  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  29  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  30  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  31  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  32  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  33  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  34  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  35  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  36  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  37  * SUCH DAMAGE.
  38  *
  39  * from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  40  * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
  41  * $DragonFly: src/sys/platform/vkernel/platform/pmap.c,v 1.17 2007/02/24 14:24:06 corecode Exp $
  42  */
  43 /*
  44  * NOTE: PMAP_INVAL_ADD: In pc32 this function is called prior to adjusting
  45  * the PTE in the page table, because a cpu synchronization might be required.
  46  * The actual invalidation is delayed until the following call or flush.  In
  47  * the VKERNEL build this function is called prior to adjusting the PTE and
  48  * invalidates the table synchronously (not delayed), and is not SMP safe
  49  * as a consequence.
  50  */
  51
  52 #include <sys/types.h>
  53 #include <sys/systm.h>
  54 #include <sys/kernel.h>
  55 #include <sys/stat.h>
  56 #include <sys/mman.h>
  57 #include <sys/vkernel.h>
  58 #include <sys/proc.h>
  59 #include <sys/thread.h>
  60 #include <sys/user.h>
  61 #include <sys/vmspace.h>
  62
  63 #include <vm/pmap.h>
  64 #include <vm/vm_page.h>
  65 #include <vm/vm_extern.h>
  66 #include <vm/vm_kern.h>
  67 #include <vm/vm_object.h>
  68 #include <vm/vm_zone.h>
  69 #include <vm/vm_pageout.h>
  70
  71 #include <machine/md_var.h>
  72 #include <machine/pcb.h>
  73 #include <machine/pmap_inval.h>
  74 #include <machine/globaldata.h>
  75
  76 #include <assert.h>
  77
  78 struct pmap kernel_pmap;
  79
  80 static struct vm_zone pvzone;
  81 static struct vm_object pvzone_obj;
  82 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
  83 static int pv_entry_count;
  84 static int pv_entry_max;
  85 static int pv_entry_high_water;
  86 static int pmap_pagedaemon_waken;
  87 static boolean_t pmap_initialized = FALSE;
  88 static int protection_codes[8];
  89
  90 static void i386_protection_init(void);
  91 static void pmap_remove_all(vm_page_t m);
  92 static int pmap_release_free_page(struct pmap *pmap, vm_page_t p);
  93
  94 #define MINPV   2048
  95 #ifndef PMAP_SHPGPERPROC
  96 #define PMAP_SHPGPERPROC 200
  97 #endif
  98
  99 #define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 100
 101 #define pte_prot(m, p) \
 102         (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
 103
 104 void
 105 pmap_init(void)
 106 {
 107         int i;
 108         struct pv_entry *pvinit;
 109
 110         for (i = 0; i < vm_page_array_size; i++) {
 111                 vm_page_t m;
 112
 113                 m = &vm_page_array[i];
 114                 TAILQ_INIT(&m->md.pv_list);
 115                 m->md.pv_list_count = 0;
 116         }
 117
 118         i = vm_page_array_size;
 119         if (i < MINPV)
 120                 i = MINPV;
 121         pvinit = (struct pv_entry *)kmem_alloc(&kernel_map, i*sizeof(*pvinit));
 122         zbootinit(&pvzone, "PV ENTRY", sizeof(*pvinit), pvinit, i);
 123         pmap_initialized = TRUE;
 124 }
 125
 126 void
 127 pmap_init2(void)
 128 {
 129         int shpgperproc = PMAP_SHPGPERPROC;
 130
 131         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 132         pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
 133         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 134         pv_entry_high_water = 9 * (pv_entry_max / 10);
 135         zinitna(&pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
 136 }
 137
 138 /*
 139  * Bootstrap the kernel_pmap so it can be used with pmap_enter().
 140  *
 141  * NOTE! pm_pdir for the kernel pmap is offset so VA's translate
 142  * directly into PTD indexes (PTA is also offset for the same reason).
 143  * This is necessary because, for now, KVA is not mapped at address 0.
 144  *
 145  * Page table pages are not managed like they are in normal pmaps, so
 146  * no pteobj is needed.
 147  */
 148 void
 149 pmap_bootstrap(void)
 150 {
 151         vm_pindex_t i = (vm_offset_t)KernelPTD >> PAGE_SHIFT;
 152
 153         kernel_pmap.pm_pdir = KernelPTD - (KvaStart >> SEG_SHIFT);
 154         kernel_pmap.pm_pdirpte = KernelPTA[i];
 155         kernel_pmap.pm_count = 1;
 156         kernel_pmap.pm_active = (cpumask_t)-1;
 157         TAILQ_INIT(&kernel_pmap.pm_pvlist);
 158         i386_protection_init();
 159 }
 160
 161 /*
 162  * Initialize pmap0/vmspace0 .  Since process 0 never enters user mode we
 163  * just dummy it up so it works well enough for fork().
 164  *
 165  * In DragonFly, process pmaps may only be used to manipulate user address
 166  * space, never kernel address space.
 167  */
 168 void
 169 pmap_pinit0(struct pmap *pmap)
 170 {
 171         pmap_pinit(pmap);
 172 }
 173
 174 /************************************************************************
 175  *              Procedures to manage whole physical maps                *
 176  ************************************************************************
 177  *
 178  * Initialize a preallocated and zeroed pmap structure,
 179  * such as one in a vmspace structure.
 180  */
 181 void
 182 pmap_pinit(struct pmap *pmap)
 183 {
 184         vm_page_t ptdpg;
 185         int npages;
 186
 187         /*
 188          * No need to allocate page table space yet but we do need a valid
 189          * page directory table.
 190          */
 191         if (pmap->pm_pdir == NULL) {
 192                 pmap->pm_pdir =
 193                     (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
 194         }
 195
 196         /*
 197          * allocate object for the pte array and page directory
 198          */
 199         npages = VPTE_PAGETABLE_SIZE +
 200                  (VM_MAX_USER_ADDRESS / PAGE_SIZE) * sizeof(vpte_t);
 201         npages = (npages + PAGE_MASK) / PAGE_SIZE;
 202
 203         if (pmap->pm_pteobj == NULL)
 204                 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, npages);
 205         pmap->pm_pdindex = npages - 1;
 206
 207         /*
 208          * allocate the page directory page
 209          */
 210         ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex,
 211                              VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 212
 213         ptdpg->wire_count = 1;
 214         ++vmstats.v_wire_count;
 215
 216         /* not usually mapped */
 217         vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
 218         ptdpg->valid = VM_PAGE_BITS_ALL;
 219
 220         pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
 221         pmap->pm_pdirpte = KernelPTA[(vm_offset_t)pmap->pm_pdir >> PAGE_SHIFT];
 222         if ((ptdpg->flags & PG_ZERO) == 0)
 223                 bzero(pmap->pm_pdir, PAGE_SIZE);
 224
 225         pmap->pm_count = 1;
 226         pmap->pm_active = 0;
 227         pmap->pm_ptphint = NULL;
 228         TAILQ_INIT(&pmap->pm_pvlist);
 229         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 230 }
 231
 232 /*
 233  * Wire in kernel global address entries.  To avoid a race condition
 234  * between pmap initialization and pmap_growkernel, this procedure
 235  * adds the pmap to the master list (which growkernel scans to update),
 236  * then copies the template.
 237  *
 238  * In a virtual kernel there are no kernel global address entries.
 239  */
 240 void
 241 pmap_pinit2(struct pmap *pmap)
 242 {
 243         crit_enter();
 244         TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
 245         crit_exit();
 246 }
 247
 248 /*
 249  * Release all resources held by the given physical map.
 250  *
 251  * Should only be called if the map contains no valid mappings.
 252  */
 253 static int pmap_release_callback(struct vm_page *p, void *data);
 254
 255 void
 256 pmap_release(struct pmap *pmap)
 257 {
 258         struct mdglobaldata *gd = mdcpu;
 259         vm_object_t object = pmap->pm_pteobj;
 260         struct rb_vm_page_scan_info info;
 261
 262         KKASSERT(pmap != &kernel_pmap);
 263
 264 #if defined(DIAGNOSTIC)
 265         if (object->ref_count != 1)
 266                 panic("pmap_release: pteobj reference count != 1");
 267 #endif
 268 #ifdef SMP
 269 #error "Must write code to clear PTxpdir cache across all CPUs"
 270 #endif
 271         /*
 272          * Once we destroy the page table, the mapping becomes invalid.
 273          * Rather then waste time doing a madvise
 274          */
 275         if (pmap->pm_pdir == gd->gd_PT1pdir) {
 276                 gd->gd_PT1pdir = NULL;
 277                 *gd->gd_PT1pde = 0;
 278                 /* madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL); */
 279         }
 280         if (pmap->pm_pdir == gd->gd_PT2pdir) {
 281                 gd->gd_PT2pdir = NULL;
 282                 *gd->gd_PT2pde = 0;
 283                 /* madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL); */
 284         }
 285
 286         info.pmap = pmap;
 287         info.object = object;
 288         crit_enter();
 289         TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
 290         crit_exit();
 291
 292         do {
 293                 crit_enter();
 294                 info.error = 0;
 295                 info.mpte = NULL;
 296                 info.limit = object->generation;
 297
 298                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
 299                                         pmap_release_callback, &info);
 300                 if (info.error == 0 && info.mpte) {
 301                         if (!pmap_release_free_page(pmap, info.mpte))
 302                                 info.error = 1;
 303                 }
 304                 crit_exit();
 305         } while (info.error);
 306
 307         /*
 308          * Leave the KVA reservation for pm_pdir cached for later reuse.
 309          */
 310         pmap->pm_pdirpte = 0;
 311 }
 312
 313 static int
 314 pmap_release_callback(struct vm_page *p, void *data)
 315 {
 316         struct rb_vm_page_scan_info *info = data;
 317
 318         if (p->pindex == info->pmap->pm_pdindex) {
 319                 info->mpte = p;
 320                 return(0);
 321         }
 322         if (!pmap_release_free_page(info->pmap, p)) {
 323                 info->error = 1;
 324                 return(-1);
 325         }
 326         if (info->object->generation != info->limit) {
 327                 info->error = 1;
 328                 return(-1);
 329         }
 330         return(0);
 331 }
 332
 333 /*
 334  * Retire the given physical map from service.  Should only be called if
 335  * the map contains no valid mappings.
 336  */
 337 void
 338 pmap_destroy(pmap_t pmap)
 339 {
 340         int count;
 341
 342         if (pmap == NULL)
 343                 return;
 344
 345         count = --pmap->pm_count;
 346         if (count == 0) {
 347                 pmap_release(pmap);
 348                 panic("destroying a pmap is not yet implemented");
 349         }
 350 }
 351
 352 /*
 353  * Add a reference to the specified pmap.
 354  */
 355 void
 356 pmap_reference(pmap_t pmap)
 357 {
 358         if (pmap != NULL) {
 359                 pmap->pm_count++;
 360         }
 361 }
 362
 363 /************************************************************************
 364  *                      VMSPACE MANAGEMENT                              *
 365  ************************************************************************
 366  *
 367  * The VMSPACE management we do in our virtual kernel must be reflected
 368  * in the real kernel.  This is accomplished by making vmspace system
 369  * calls to the real kernel.
 370  */
 371 void
 372 cpu_vmspace_alloc(struct vmspace *vm)
 373 {
 374         int r;
 375         void *rp;
 376
 377 #define LAST_EXTENT     (VM_MAX_USER_ADDRESS - 0x80000000)
 378
 379         if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0)
 380                 panic("vmspace_create() failed");
 381
 382         rp = vmspace_mmap(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
 383                           PROT_READ|PROT_WRITE,
 384                           MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
 385                           MemImageFd, 0);
 386         if (rp == MAP_FAILED)
 387                 panic("vmspace_mmap: failed1");
 388         vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
 389                          MADV_NOSYNC, 0);
 390         rp = vmspace_mmap(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
 391                           PROT_READ|PROT_WRITE,
 392                           MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
 393                           MemImageFd, 0x40000000);
 394         if (rp == MAP_FAILED)
 395                 panic("vmspace_mmap: failed2");
 396         vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
 397                          MADV_NOSYNC, 0);
 398         rp = vmspace_mmap(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
 399                           PROT_READ|PROT_WRITE,
 400                           MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
 401                           MemImageFd, 0x80000000);
 402         vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
 403                          MADV_NOSYNC, 0);
 404         if (rp == MAP_FAILED)
 405                 panic("vmspace_mmap: failed3");
 406
 407         r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
 408                              MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
 409         if (r < 0)
 410                 panic("vmspace_mcontrol: failed1");
 411         r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
 412                              MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
 413         if (r < 0)
 414                 panic("vmspace_mcontrol: failed2");
 415         r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
 416                              MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
 417         if (r < 0)
 418                 panic("vmspace_mcontrol: failed3");
 419 }
 420
 421 void
 422 cpu_vmspace_free(struct vmspace *vm)
 423 {
 424         if (vmspace_destroy(&vm->vm_pmap) < 0)
 425                 panic("vmspace_destroy() failed");
 426 }
 427
 428 /************************************************************************
 429  *          Procedures which operate directly on the kernel PMAP        *
 430  ************************************************************************/
 431
 432 /*
 433  * This maps the requested page table and gives us access to it.
 434  */
 435 static vpte_t *
 436 get_ptbase(struct pmap *pmap, vm_offset_t va)
 437 {
 438         struct mdglobaldata *gd = mdcpu;
 439
 440         if (pmap == &kernel_pmap) {
 441                 KKASSERT(va >= KvaStart && va < KvaEnd);
 442                 return(KernelPTA + (va >> PAGE_SHIFT));
 443         } else if (pmap->pm_pdir == gd->gd_PT1pdir) {
 444                 return(gd->gd_PT1map + (va >> PAGE_SHIFT));
 445         } else if (pmap->pm_pdir == gd->gd_PT2pdir) {
 446                 return(gd->gd_PT2map + (va >> PAGE_SHIFT));
 447         }
 448
 449         /*
 450          * Otherwise choose one or the other and map the page table
 451          * in the KVA space reserved for it.
 452          */
 453         KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
 454                  (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
 455
 456         if ((gd->gd_PTflip = 1 - gd->gd_PTflip) == 0) {
 457                 gd->gd_PT1pdir = pmap->pm_pdir;
 458                 *gd->gd_PT1pde = pmap->pm_pdirpte;
 459                 madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
 460                 return(gd->gd_PT1map + (va >> PAGE_SHIFT));
 461         } else {
 462                 gd->gd_PT2pdir = pmap->pm_pdir;
 463                 *gd->gd_PT2pde = pmap->pm_pdirpte;
 464                 madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
 465                 return(gd->gd_PT2map + (va >> PAGE_SHIFT));
 466         }
 467 }
 468
 469 static vpte_t *
 470 get_ptbase1(struct pmap *pmap, vm_offset_t va)
 471 {
 472         struct mdglobaldata *gd = mdcpu;
 473
 474         if (pmap == &kernel_pmap) {
 475                 KKASSERT(va >= KvaStart && va < KvaEnd);
 476                 return(KernelPTA + (va >> PAGE_SHIFT));
 477         } else if (pmap->pm_pdir == gd->gd_PT1pdir) {
 478                 return(gd->gd_PT1map + (va >> PAGE_SHIFT));
 479         }
 480         KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
 481                  (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
 482         gd->gd_PT1pdir = pmap->pm_pdir;
 483         *gd->gd_PT1pde = pmap->pm_pdirpte;
 484         madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
 485         return(gd->gd_PT1map + (va >> PAGE_SHIFT));
 486 }
 487
 488 static vpte_t *
 489 get_ptbase2(struct pmap *pmap, vm_offset_t va)
 490 {
 491         struct mdglobaldata *gd = mdcpu;
 492
 493         if (pmap == &kernel_pmap) {
 494                 KKASSERT(va >= KvaStart && va < KvaEnd);
 495                 return(KernelPTA + (va >> PAGE_SHIFT));
 496         } else if (pmap->pm_pdir == gd->gd_PT2pdir) {
 497                 return(gd->gd_PT2map + (va >> PAGE_SHIFT));
 498         }
 499         KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
 500                  (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
 501         gd->gd_PT2pdir = pmap->pm_pdir;
 502         *gd->gd_PT2pde = pmap->pm_pdirpte;
 503         madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
 504         return(gd->gd_PT2map + (va >> PAGE_SHIFT));
 505 }
 506
 507 /*
 508  * When removing a page directory the related VA range in the self-mapping
 509  * of the page table must be invalidated.
 510  */
 511 static void
 512 inval_ptbase_pagedir(pmap_t pmap, vm_pindex_t pindex)
 513 {
 514         struct mdglobaldata *gd = mdcpu;
 515         vm_offset_t va;
 516
 517 #ifdef SMP
 518 #error "Must inval self-mappings in all gd's"
 519 #endif
 520         if (pmap == &kernel_pmap) {
 521                 va = (vm_offset_t)KernelPTA + (pindex << PAGE_SHIFT);
 522                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 523         } else {
 524                 /*
 525                  * XXX this should not strictly be needed because the page
 526                  * dir should alread be invalidated.  test and remove
 527                  */
 528                 va = (vm_offset_t)pindex << PAGE_SHIFT;
 529                 vmspace_mcontrol(pmap, (void *)va, SEG_SIZE, MADV_INVAL, 0);
 530         }
 531         if (pmap->pm_pdir == gd->gd_PT1pdir) {
 532                 va = (vm_offset_t)gd->gd_PT1map + (pindex << PAGE_SHIFT);
 533                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 534         }
 535         if (pmap->pm_pdir == gd->gd_PT2pdir) {
 536                 va = (vm_offset_t)gd->gd_PT2map + (pindex << PAGE_SHIFT);
 537                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 538         }
 539 }
 540
 541 /*
 542  * Return a pointer to the page table entry for the specified va in the
 543  * specified pmap.  NULL is returned if there is no valid page table page
 544  * for the VA.
 545  */
 546 static __inline vpte_t *
 547 pmap_pte(struct pmap *pmap, vm_offset_t va)
 548 {
 549         vpte_t *ptep;
 550
 551         ptep = &pmap->pm_pdir[va >> SEG_SHIFT];
 552         if (*ptep & VPTE_PS)
 553                 return(ptep);
 554         if (*ptep)
 555                 return (get_ptbase(pmap, va));
 556         return(NULL);
 557 }
 558
 559
 560 /*
 561  * Enter a mapping into kernel_pmap.  Mappings created in this fashion
 562  * are not managed.
 563  */
 564 void
 565 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 566 {
 567         vpte_t *ptep;
 568         vpte_t npte;
 569 #ifdef SMP
 570         pmap_inval_info info;
 571 #endif
 572
 573         KKASSERT(va >= KvaStart && va < KvaEnd);
 574         npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
 575         ptep = KernelPTA + (va >> PAGE_SHIFT);
 576         if (*ptep & VPTE_V) {
 577 #ifdef SMP
 578                 pmap_inval_init(&info);
 579                 pmap_inval_add(&info, &kernel_pmap, va);
 580 #endif
 581                 *ptep = npte;
 582 #ifdef SMP
 583                 pmap_inval_flush(&info);
 584 #else
 585                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 586 #endif
 587         } else {
 588                 *ptep = npte;
 589         }
 590 }
 591
 592 void
 593 pmap_kenter_sync(vm_offset_t va)
 594 {
 595         pmap_inval_info info;
 596
 597         pmap_inval_init(&info);
 598         pmap_inval_add(&info, &kernel_pmap, va);
 599         pmap_inval_flush(&info);
 600 }
 601
 602 void
 603 pmap_kenter_sync_quick(vm_offset_t va)
 604 {
 605         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 606 }
 607
 608 /*
 609  * XXX these need to be recoded.  They are not used in any critical path.
 610  */
 611 void
 612 pmap_kmodify_rw(vm_offset_t va)
 613 {
 614         *pmap_kpte(va) |= VPTE_R | VPTE_W;
 615         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 616 }
 617
 618 void
 619 pmap_kmodify_nc(vm_offset_t va)
 620 {
 621 #if 0
 622         *pmap_kpte(va) |= VPTE_N;
 623         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 624 #endif
 625 }
 626
 627 /*
 628  * Map a contiguous range of physical memory to a KVM
 629  */
 630 vm_offset_t
 631 pmap_map(vm_offset_t virt, vm_paddr_t start, vm_paddr_t end, int prot)
 632 {
 633         while (start < end) {
 634                 pmap_kenter(virt, start);
 635                 virt += PAGE_SIZE;
 636                 start += PAGE_SIZE;
 637         }
 638         return (virt);
 639 }
 640
 641 vpte_t *
 642 pmap_kpte(vm_offset_t va)
 643 {
 644         vpte_t *ptep;
 645
 646         KKASSERT(va >= KvaStart && va < KvaEnd);
 647         ptep = KernelPTA + (va >> PAGE_SHIFT);
 648         return(ptep);
 649 }
 650
 651 /*
 652  * Enter a mapping into kernel_pmap without any SMP interactions.
 653  *
 654  * Mappings created in this fashion are not managed.
 655  */
 656 void
 657 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
 658 {
 659         vpte_t *ptep;
 660         vpte_t npte;
 661
 662         KKASSERT(va >= KvaStart && va < KvaEnd);
 663
 664         npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
 665         ptep = KernelPTA + (va >> PAGE_SHIFT);
 666         if (*ptep & VPTE_V) {
 667                 *ptep = npte;
 668                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 669         } else {
 670                 *ptep = npte;
 671         }
 672 }
 673
 674 /*
 675  * Make a temporary mapping for a physical address.  This is only intended
 676  * to be used for panic dumps.
 677  */
 678 void *
 679 pmap_kenter_temporary(vm_paddr_t pa, int i)
 680 {
 681         pmap_kenter(crashdumpmap + (i * PAGE_SIZE), pa);
 682         return ((void *)crashdumpmap);
 683 }
 684
 685 /*
 686  * Remove an unmanaged mapping created with pmap_kenter*().
 687  */
 688 void
 689 pmap_kremove(vm_offset_t va)
 690 {
 691         vpte_t *ptep;
 692 #ifdef SMP
 693         pmap_inval_info info;
 694 #endif
 695
 696         KKASSERT(va >= KvaStart && va < KvaEnd);
 697
 698         ptep = KernelPTA + (va >> PAGE_SHIFT);
 699         if (*ptep & VPTE_V) {
 700 #ifdef SMP
 701                 pmap_inval_init(&info);
 702                 pmap_inval_add(&info, &kernel_pmap, va);
 703 #endif
 704                 *ptep = 0;
 705 #ifdef SMP
 706                 pmap_inval_flush(&info);
 707 #else
 708                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 709 #endif
 710         } else {
 711                 *ptep = 0;
 712         }
 713
 714 }
 715
 716 /*
 717  * Remove an unmanaged mapping created with pmap_kenter*() without
 718  * going through any SMP interactions.
 719  */
 720 void
 721 pmap_kremove_quick(vm_offset_t va)
 722 {
 723         vpte_t *ptep;
 724
 725         KKASSERT(va >= KvaStart && va < KvaEnd);
 726
 727         ptep = KernelPTA + (va >> PAGE_SHIFT);
 728         if (*ptep & VPTE_V) {
 729                 *ptep = 0;
 730                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 731         } else {
 732                 *ptep = 0;
 733         }
 734 }
 735
 736 /*
 737  * Extract the physical address from the kernel_pmap that is associated
 738  * with the specified virtual address.
 739  */
 740 vm_paddr_t
 741 pmap_kextract(vm_offset_t va)
 742 {
 743         vpte_t *ptep;
 744         vm_paddr_t pa;
 745
 746         KKASSERT(va >= KvaStart && va < KvaEnd);
 747
 748         ptep = KernelPTA + (va >> PAGE_SHIFT);
 749         pa = (vm_paddr_t)(*ptep & VPTE_FRAME) | (va & PAGE_MASK);
 750         return(pa);
 751 }
 752
 753 /*
 754  * Map a set of unmanaged VM pages into KVM.
 755  */
 756 void
 757 pmap_qenter(vm_offset_t va, struct vm_page **m, int count)
 758 {
 759         KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);
 760         while (count) {
 761                 vpte_t *ptep;
 762
 763                 ptep = KernelPTA + (va >> PAGE_SHIFT);
 764                 if (*ptep & VPTE_V)
 765                         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 766                 *ptep = (vpte_t)(*m)->phys_addr | VPTE_R | VPTE_W | VPTE_V;
 767                 --count;
 768                 ++m;
 769                 va += PAGE_SIZE;
 770         }
 771 #ifdef SMP
 772         XXX
 773         smp_invltlb();
 774 #endif
 775 }
 776
 777 /*
 778  * Map a set of VM pages to kernel virtual memory.  If a mapping changes
 779  * clear the supplied mask.  The caller handles any SMP interactions.
 780  * The mask is used to provide the caller with hints on what SMP interactions
 781  * might be needed.
 782  */
 783 void
 784 pmap_qenter2(vm_offset_t va, struct vm_page **m, int count, cpumask_t *mask)
 785 {
 786         cpumask_t cmask = mycpu->gd_cpumask;
 787
 788         KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);
 789         while (count) {
 790                 vpte_t *ptep;
 791                 vpte_t npte;
 792
 793                 ptep = KernelPTA + (va >> PAGE_SHIFT);
 794                 npte = (vpte_t)(*m)->phys_addr | VPTE_R | VPTE_W | VPTE_V;
 795                 if (*ptep != npte) {
 796                         *mask = 0;
 797                         *ptep = npte;
 798                         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 799                 } else if ((*mask & cmask) == 0) {
 800                         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 801                 }
 802                 --count;
 803                 ++m;
 804                 va += PAGE_SIZE;
 805         }
 806         *mask |= cmask;
 807 }
 808
 809 /*
 810  * Undo the effects of pmap_qenter*().
 811  */
 812 void
 813 pmap_qremove(vm_offset_t va, int count)
 814 {
 815         KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);
 816         while (count) {
 817                 vpte_t *ptep;
 818
 819                 ptep = KernelPTA + (va >> PAGE_SHIFT);
 820                 if (*ptep & VPTE_V)
 821                         madvise((void *)va, PAGE_SIZE, MADV_INVAL);
 822                 *ptep = 0;
 823                 --count;
 824                 va += PAGE_SIZE;
 825         }
 826 #ifdef SMP
 827         XXX
 828         smp_invltlb();
 829 #endif
 830 }
 831
/************************************************************************
 *        Misc support glue called by machine independent code          *
 ************************************************************************
 *
 * These routines are called by machine independent code to operate on
 * certain machine-dependent aspects of processes, threads, and pmaps.
 */
 839
 840 /*
 841  * Initialize MD portions of the thread structure.
 842  */
 843 void
 844 pmap_init_thread(thread_t td)
 845 {
 846         /* enforce pcb placement */
 847         td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
 848         td->td_savefpu = &td->td_pcb->pcb_save;
 849         td->td_sp = (char *)td->td_pcb - 16;
 850 }
 851
 852 /*
 853  * Initialize MD portions of a process structure. XXX this aint MD
 854  */
 855 void
 856 pmap_init_proc(struct proc *p, struct thread *td)
 857 {
 858         struct lwp *lp = ONLY_LWP_IN_PROC(p);
 859
 860         p->p_addr = (void *)td->td_kstack;
 861         lp->lwp_thread = td;
 862         td->td_proc = p;
 863         td->td_lwp = lp;
 864         td->td_switch = cpu_heavy_switch;
 865 #ifdef SMP
 866         KKASSERT(td->td_mpcount == 1);
 867 #endif
 868         bzero(p->p_addr, sizeof(*p->p_addr));
 869 }
 870
 871 /*
 872  * Destroy the UPAGES for a process that has exited and disassociate
 873  * the process from its thread.
 874  */
 875 void
 876 pmap_dispose_proc(struct proc *p)
 877 {
 878         KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
 879
 880         lwp_dispose(ONLY_LWP_IN_PROC(p));
 881         p->p_addr = NULL;
 882 }
 883
 884 /*
 885  * We pre-allocate all page table pages for kernel virtual memory so
 886  * this routine will only be called if KVM has been exhausted.
 887  */
 888 void
 889 pmap_growkernel(vm_offset_t addr)
 890 {
 891         addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 892
 893         if (addr > virtual_end - SEG_SIZE)
 894                 panic("KVM exhausted");
 895         kernel_vm_end = addr;
 896 }
 897
 898 /*
 899  * The modification bit is not tracked for any pages in this range. XXX
 900  * such pages in this maps should always use pmap_k*() functions and not
 901  * be managed anyhow.
 902  *
 903  * XXX User and kernel address spaces are independant for virtual kernels,
 904  * this function only applies to the kernel pmap.
 905  */
 906 static int
 907 pmap_track_modified(pmap_t pmap, vm_offset_t va)
 908 {
 909         if (pmap != &kernel_pmap)
 910                 return 1;
 911         if ((va < clean_sva) || (va >= clean_eva))
 912                 return 1;
 913         else
 914                 return 0;
 915 }
 916
 917 /************************************************************************
 918  *          Procedures supporting managed page table pages              *
 919  ************************************************************************
 920  *
 921  * These procedures are used to track managed page table pages.  These pages
 922  * use the page table page's vm_page_t to track PTEs in the page.  The
 923  * page table pages themselves are arranged in a VM object, pmap->pm_pteobj.
 924  *
 925  * This allows the system to throw away page table pages for user processes
 926  * at will and reinstantiate them on demand.
 927  */
 928
 929 /*
 930  * This routine works like vm_page_lookup() but also blocks as long as the
 931  * page is busy.  This routine does not busy the page it returns.
 932  *
 933  * Unless the caller is managing objects whos pages are in a known state,
 934  * the call should be made with a critical section held so the page's object
 935  * association remains valid on return.
 936  */
 937 static vm_page_t
 938 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
 939 {
 940         vm_page_t m;
 941
 942 retry:
 943         m = vm_page_lookup(object, pindex);
 944         if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
 945                 goto retry;
 946         return(m);
 947 }
 948
 949 /*
 950  * This routine unholds page table pages, and if the hold count
 951  * drops to zero, then it decrements the wire count.
 952  */
 953 static int
 954 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
 955 {
 956         pmap_inval_flush(info);
 957         while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
 958                 ;
 959
 960         if (m->hold_count == 0) {
 961                 /*
 962                  * unmap the page table page
 963                  */
 964                 pmap->pm_pdir[m->pindex] = 0;
 965                 --pmap->pm_stats.resident_count;
 966                 inval_ptbase_pagedir(pmap, m->pindex);
 967
 968                 if (pmap->pm_ptphint == m)
 969                         pmap->pm_ptphint = NULL;
 970
 971                 /*
 972                  * If the page is finally unwired, simply free it.
 973                  */
 974                 --m->wire_count;
 975                 if (m->wire_count == 0) {
 976                         vm_page_flash(m);
 977                         vm_page_busy(m);
 978                         vm_page_free_zero(m);
 979                         --vmstats.v_wire_count;
 980                 }
 981                 return 1;
 982         }
 983         return 0;
 984 }
 985
 986 static __inline int
 987 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
 988 {
 989         vm_page_unhold(m);
 990         if (m->hold_count == 0)
 991                 return _pmap_unwire_pte_hold(pmap, m, info);
 992         else
 993                 return 0;
 994 }
 995
 996 /*
 997  * After removing a page table entry, this routine is used to
 998  * conditionally free the page, and manage the hold/wire counts.
 999  */
1000 static int
1001 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
1002                 pmap_inval_info_t info)
1003 {
1004         unsigned ptepindex;
1005
1006         if (mpte == NULL) {
1007                 /*
1008                  * page table pages in the kernel_pmap are not managed.
1009                  */
1010                 if (pmap == &kernel_pmap)
1011                         return(0);
1012                 ptepindex = (va >> PDRSHIFT);
1013                 if (pmap->pm_ptphint &&
1014                         (pmap->pm_ptphint->pindex == ptepindex)) {
1015                         mpte = pmap->pm_ptphint;
1016                 } else {
1017                         pmap_inval_flush(info);
1018                         mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1019                         pmap->pm_ptphint = mpte;
1020                 }
1021         }
1022         return pmap_unwire_pte_hold(pmap, mpte, info);
1023 }
1024
1025 /*
1026  * Attempt to release and free an vm_page in a pmap.  Returns 1 on success,
1027  * 0 on failure (if the procedure had to sleep).
1028  */
1029 static int
1030 pmap_release_free_page(struct pmap *pmap, vm_page_t p)
1031 {
1032         vpte_t *pde = pmap->pm_pdir;
1033         /*
1034          * This code optimizes the case of freeing non-busy
1035          * page-table pages.  Those pages are zero now, and
1036          * might as well be placed directly into the zero queue.
1037          */
1038         if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1039                 return 0;
1040
1041         vm_page_busy(p);
1042         pmap->pm_stats.resident_count--;
1043
1044         if (p->hold_count)  {
1045                 panic("pmap_release: freeing held page table page");
1046         }
1047         /*
1048          * Page directory pages need to have the kernel stuff cleared, so
1049          * they can go into the zero queue also.
1050          *
1051          * In virtual kernels there is no 'kernel stuff'.  For the moment
1052          * I just make sure the whole thing has been zero'd even though
1053          * it should already be completely zero'd.
1054          *
1055          * pmaps for vkernels do not self-map because they do not share
1056          * their address space with the vkernel.  Clearing of pde[] thus
1057          * only applies to page table pages and not to the page directory
1058          * page.
1059          */
1060         if (p->pindex == pmap->pm_pdindex) {
1061                 bzero(pde, VPTE_PAGETABLE_SIZE);
1062                 pmap_kremove((vm_offset_t)pmap->pm_pdir);
1063         } else {
1064                 pde[p->pindex] = 0;
1065         }
1066
1067         /*
1068          * Clear the matching hint
1069          */
1070         if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1071                 pmap->pm_ptphint = NULL;
1072
1073         /*
1074          * And throw the page away.  The page is completely zero'd out so
1075          * optimize the free call.
1076          */
1077         p->wire_count--;
1078         vmstats.v_wire_count--;
1079         vm_page_free_zero(p);
1080         return 1;
1081 }
1082
1083 /*
1084  * This routine is called if the page table page is not mapped in the page
1085  * table directory.
1086  *
1087  * The routine is broken up into two parts for readability.
1088  */
1089 static vm_page_t
1090 _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
1091 {
1092         vm_paddr_t ptepa;
1093         vm_page_t m;
1094
1095         /*
1096          * Find or fabricate a new pagetable page
1097          */
1098         m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1099                          VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1100
1101         KASSERT(m->queue == PQ_NONE,
1102                 ("_pmap_allocpte: %p->queue != PQ_NONE", m));
1103
1104         if (m->wire_count == 0)
1105                 vmstats.v_wire_count++;
1106         m->wire_count++;
1107
1108         /*
1109          * Increment the hold count for the page table page
1110          * (denoting a new mapping.)
1111          */
1112         m->hold_count++;
1113
1114         /*
1115          * Map the pagetable page into the process address space, if
1116          * it isn't already there.
1117          */
1118         pmap->pm_stats.resident_count++;
1119
1120         ptepa = VM_PAGE_TO_PHYS(m);
1121         pmap->pm_pdir[ptepindex] = (vpte_t)ptepa | VPTE_R | VPTE_W | VPTE_V |
1122                                    VPTE_A | VPTE_M;
1123
1124         /*
1125          * We are likely about to access this page table page, so set the
1126          * page table hint to reduce overhead.
1127          */
1128         pmap->pm_ptphint = m;
1129
1130         /*
1131          * Try to use the new mapping, but if we cannot, then
1132          * do it with the routine that maps the page explicitly.
1133          */
1134         if ((m->flags & PG_ZERO) == 0)
1135                 pmap_zero_page(ptepa);
1136
1137         m->valid = VM_PAGE_BITS_ALL;
1138         vm_page_flag_clear(m, PG_ZERO);
1139         vm_page_flag_set(m, PG_MAPPED);
1140         vm_page_wakeup(m);
1141
1142         return (m);
1143 }
1144
1145 /*
1146  * Determine the page table page required to access the VA in the pmap
1147  * and allocate it if necessary.  Return a held vm_page_t for the page.
1148  *
1149  * Only used with user pmaps.
1150  */
1151 static vm_page_t
1152 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1153 {
1154         unsigned ptepindex;
1155         vm_offset_t ptepa;
1156         vm_page_t m;
1157
1158         /*
1159          * Calculate pagetable page index
1160          */
1161         ptepindex = va >> PDRSHIFT;
1162
1163         /*
1164          * Get the page directory entry
1165          */
1166         ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1167
1168         /*
1169          * This supports switching from a 4MB page to a
1170          * normal 4K page.
1171          */
1172         if (ptepa & VPTE_PS) {
1173                 pmap->pm_pdir[ptepindex] = 0;
1174                 ptepa = 0;
1175                 cpu_invltlb();
1176                 smp_invltlb();
1177         }
1178
1179         /*
1180          * If the page table page is mapped, we just increment the
1181          * hold count, and activate it.
1182          */
1183         if (ptepa) {
1184                 /*
1185                  * In order to get the page table page, try the
1186                  * hint first.
1187                  */
1188                 if (pmap->pm_ptphint &&
1189                         (pmap->pm_ptphint->pindex == ptepindex)) {
1190                         m = pmap->pm_ptphint;
1191                 } else {
1192                         m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1193                         pmap->pm_ptphint = m;
1194                 }
1195                 m->hold_count++;
1196                 return m;
1197         }
1198         /*
1199          * Here if the pte page isn't mapped, or if it has been deallocated.
1200          */
1201         return _pmap_allocpte(pmap, ptepindex);
1202 }
1203
1204 /************************************************************************
1205  *                      Managed pages in pmaps                          *
1206  ************************************************************************
1207  *
1208  * All pages entered into user pmaps and some pages entered into the kernel
1209  * pmap are managed, meaning that pmap_protect() and other related management
1210  * functions work on these pages.
1211  */
1212
1213 /*
1214  * free the pv_entry back to the free list.  This function may be
1215  * called from an interrupt.
1216  */
1217 static __inline void
1218 free_pv_entry(pv_entry_t pv)
1219 {
1220         pv_entry_count--;
1221         zfree(&pvzone, pv);
1222 }
1223
1224 /*
1225  * get a new pv_entry, allocating a block from the system
1226  * when needed.  This function may be called from an interrupt.
1227  */
1228 static pv_entry_t
1229 get_pv_entry(void)
1230 {
1231         pv_entry_count++;
1232         if (pv_entry_high_water &&
1233                 (pv_entry_count > pv_entry_high_water) &&
1234                 (pmap_pagedaemon_waken == 0)) {
1235                 pmap_pagedaemon_waken = 1;
1236                 wakeup (&vm_pages_needed);
1237         }
1238         return zalloc(&pvzone);
1239 }
1240
1241 /*
1242  * This routine is very drastic, but can save the system
1243  * in a pinch.
1244  */
1245 void
1246 pmap_collect(void)
1247 {
1248         int i;
1249         vm_page_t m;
1250         static int warningdone=0;
1251
1252         if (pmap_pagedaemon_waken == 0)
1253                 return;
1254
1255         if (warningdone < 5) {
1256                 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1257                 warningdone++;
1258         }
1259
1260         for(i = 0; i < vm_page_array_size; i++) {
1261                 m = &vm_page_array[i];
1262                 if (m->wire_count || m->hold_count || m->busy ||
1263                     (m->flags & PG_BUSY))
1264                         continue;
1265                 pmap_remove_all(m);
1266         }
1267         pmap_pagedaemon_waken = 0;
1268 }
1269
1270 /*
1271  * If it is the first entry on the list, it is actually
1272  * in the header and we must copy the following entry up
1273  * to the header.  Otherwise we must search the list for
1274  * the entry.  In either case we free the now unused entry.
1275  */
1276 static int
1277 pmap_remove_entry(struct pmap *pmap, vm_page_t m,
1278                   vm_offset_t va, pmap_inval_info_t info)
1279 {
1280         pv_entry_t pv;
1281         int rtval;
1282
1283         crit_enter();
1284         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1285                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1286                         if (pmap == pv->pv_pmap && va == pv->pv_va)
1287                                 break;
1288                 }
1289         } else {
1290                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1291                         if (va == pv->pv_va)
1292                                 break;
1293                 }
1294         }
1295
1296         /*
1297          * Note that pv_ptem is NULL if the page table page itself is not
1298          * managed, even if the page being removed IS managed.
1299          */
1300         rtval = 0;
1301         if (pv) {
1302                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1303                 m->md.pv_list_count--;
1304                 if (TAILQ_FIRST(&m->md.pv_list) == NULL)
1305                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1306                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1307                 ++pmap->pm_generation;
1308                 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
1309                 free_pv_entry(pv);
1310         }
1311         crit_exit();
1312         return rtval;
1313 }
1314
1315 /*
1316  * Create a pv entry for page at pa for (pmap, va).  If the page table page
1317  * holding the VA is managed, mpte will be non-NULL.
1318  */
1319 static void
1320 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1321 {
1322         pv_entry_t pv;
1323
1324         crit_enter();
1325         pv = get_pv_entry();
1326         pv->pv_va = va;
1327         pv->pv_pmap = pmap;
1328         pv->pv_ptem = mpte;
1329
1330         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1331         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1332         m->md.pv_list_count++;
1333
1334         crit_exit();
1335 }
1336
1337 /*
1338  * pmap_remove_pte: do the things to unmap a page in a process
1339  */
1340 static int
1341 pmap_remove_pte(struct pmap *pmap, vpte_t *ptq, vm_offset_t va,
1342         pmap_inval_info_t info)
1343 {
1344         vpte_t oldpte;
1345         vm_page_t m;
1346
1347         oldpte = loadandclear(ptq);
1348         pmap_inval_add(info, pmap, va); /* See NOTE: PMAP_INVAL_ADD */
1349         if (oldpte & VPTE_WIRED)
1350                 --pmap->pm_stats.wired_count;
1351         KKASSERT(pmap->pm_stats.wired_count >= 0);
1352
1353 #if 0
1354         /*
1355          * Machines that don't support invlpg, also don't support
1356          * VPTE_G.  XXX VPTE_G is disabled for SMP so don't worry about
1357          * the SMP case.
1358          */
1359         if (oldpte & VPTE_G)
1360                 madvise((void *)va, PAGE_SIZE, MADV_INVAL);
1361 #endif
1362         pmap->pm_stats.resident_count -= 1;
1363         if (oldpte & VPTE_MANAGED) {
1364                 m = PHYS_TO_VM_PAGE(oldpte);
1365                 if (oldpte & VPTE_M) {
1366 #if defined(PMAP_DIAGNOSTIC)
1367                         if (pmap_nw_modified((pt_entry_t) oldpte)) {
1368                                 kprintf(
1369         "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1370                                     va, oldpte);
1371                         }
1372 #endif
1373                         if (pmap_track_modified(pmap, va))
1374                                 vm_page_dirty(m);
1375                 }
1376                 if (oldpte & VPTE_A)
1377                         vm_page_flag_set(m, PG_REFERENCED);
1378                 return pmap_remove_entry(pmap, m, va, info);
1379         } else {
1380                 return pmap_unuse_pt(pmap, va, NULL, info);
1381         }
1382
1383         return 0;
1384 }
1385
1386 /*
1387  * pmap_remove_page:
1388  *
1389  *      Remove a single page from a process address space.
1390  *
1391  *      This function may not be called from an interrupt if the pmap is
1392  *      not kernel_pmap.
1393  */
1394 static void
1395 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
1396 {
1397         vpte_t *ptq;
1398
1399         /*
1400          * if there is no pte for this address, just skip it!!!  Otherwise
1401          * get a local va for mappings for this pmap and remove the entry.
1402          */
1403         if (*pmap_pde(pmap, va) != 0) {
1404                 ptq = get_ptbase(pmap, va);
1405                 if (*ptq) {
1406                         pmap_remove_pte(pmap, ptq, va, info);
1407                 }
1408         }
1409 }
1410
1411 /*
1412  * pmap_remove:
1413  *
1414  *      Remove the given range of addresses from the specified map.
1415  *
1416  *      It is assumed that the start and end are properly
1417  *      rounded to the page size.
1418  *
1419  *      This function may not be called from an interrupt if the pmap is
1420  *      not kernel_pmap.
1421  */
1422 void
1423 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
1424 {
1425         vpte_t *ptbase;
1426         vm_offset_t pdnxt;
1427         vm_offset_t ptpaddr;
1428         vm_pindex_t sindex, eindex;
1429         struct pmap_inval_info info;
1430
1431         if (pmap == NULL)
1432                 return;
1433
1434         KKASSERT(pmap->pm_stats.resident_count >= 0);
1435         if (pmap->pm_stats.resident_count == 0)
1436                 return;
1437
1438         pmap_inval_init(&info);
1439
1440         /*
1441          * special handling of removing one page.  a very
1442          * common operation and easy to short circuit some
1443          * code.
1444          */
1445         if (((sva + PAGE_SIZE) == eva) &&
1446                 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & VPTE_PS) == 0)) {
1447                 pmap_remove_page(pmap, sva, &info);
1448                 pmap_inval_flush(&info);
1449                 return;
1450         }
1451
1452         /*
1453          * Get a local virtual address for the mappings that are being
1454          * worked with.
1455          *
1456          * XXX this is really messy because the kernel pmap is not relative
1457          * to address 0
1458          */
1459         sindex = (sva >> PAGE_SHIFT);
1460         eindex = (eva >> PAGE_SHIFT);
1461
1462         for (; sindex < eindex; sindex = pdnxt) {
1463                 vpte_t pdirindex;
1464
1465                 /*
1466                  * Calculate index for next page table.
1467                  */
1468                 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1469                 if (pmap->pm_stats.resident_count == 0)
1470                         break;
1471
1472                 pdirindex = sindex / NPDEPG;
1473                 if (((ptpaddr = pmap->pm_pdir[pdirindex]) & VPTE_PS) != 0) {
1474                         pmap->pm_pdir[pdirindex] = 0;
1475                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1476                         inval_ptbase_pagedir(pmap, pdirindex);
1477                         continue;
1478                 }
1479
1480                 /*
1481                  * Weed out invalid mappings. Note: we assume that the page
1482                  * directory table is always allocated, and in kernel virtual.
1483                  */
1484                 if (ptpaddr == 0)
1485                         continue;
1486
1487                 /*
1488                  * Limit our scan to either the end of the va represented
1489                  * by the current page table page, or to the end of the
1490                  * range being removed.
1491                  */
1492                 if (pdnxt > eindex)
1493                         pdnxt = eindex;
1494
1495                 /*
1496                  * NOTE: pmap_remove_pte() can block.
1497                  */
1498                 for (; sindex != pdnxt; sindex++) {
1499                         vm_offset_t va;
1500
1501                         ptbase = get_ptbase(pmap, sindex << PAGE_SHIFT);
1502                         if (*ptbase == 0)
1503                                 continue;
1504                         va = i386_ptob(sindex);
1505                         if (pmap_remove_pte(pmap, ptbase, va, &info))
1506                                 break;
1507                 }
1508         }
1509         pmap_inval_flush(&info);
1510 }
1511
1512 /*
1513  * pmap_remove_all:
1514  *
1515  * Removes this physical page from all physical maps in which it resides.
1516  * Reflects back modify bits to the pager.
1517  *
1518  * This routine may not be called from an interrupt.
1519  */
1520 static void
1521 pmap_remove_all(vm_page_t m)
1522 {
1523         struct pmap_inval_info info;
1524         vpte_t *pte, tpte;
1525         pv_entry_t pv;
1526
1527 #if defined(PMAP_DIAGNOSTIC)
1528         /*
1529          * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1530          * pages!
1531          */
1532         if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
1533                 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m));
1534         }
1535 #endif
1536
1537         pmap_inval_init(&info);
1538         crit_enter();
1539         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1540                 pv->pv_pmap->pm_stats.resident_count--;
1541
1542                 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
1543                 KKASSERT(pte != NULL);
1544
1545                 tpte = loadandclear(pte);
1546                 /* See NOTE: PMAP_INVAL_ADD */
1547                 pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
1548                 if (tpte & VPTE_WIRED)
1549                         --pv->pv_pmap->pm_stats.wired_count;
1550                 KKASSERT(pv->pv_pmap->pm_stats.wired_count >= 0);
1551
1552                 if (tpte & VPTE_A)
1553                         vm_page_flag_set(m, PG_REFERENCED);
1554
1555                 /*
1556                  * Update the vm_page_t clean and reference bits.
1557                  */
1558                 if (tpte & VPTE_M) {
1559 #if defined(PMAP_DIAGNOSTIC)
1560                         if (pmap_nw_modified((pt_entry_t) tpte)) {
1561                                 kprintf(
1562         "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1563                                     pv->pv_va, tpte);
1564                         }
1565 #endif
1566                         if (pmap_track_modified(pv->pv_pmap, pv->pv_va))
1567                                 vm_page_dirty(m);
1568                 }
1569                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1570                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1571                 ++pv->pv_pmap->pm_generation;
1572                 m->md.pv_list_count--;
1573                 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
1574                 free_pv_entry(pv);
1575         }
1576
1577         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1578         crit_exit();
1579         pmap_inval_flush(&info);
1580 }
1581
1582 /*
1583  * pmap_protect:
1584  *
1585  *      Set the physical protection on the specified range of this map
1586  *      as requested.
1587  *
1588  *      This function may not be called from an interrupt if the map is
1589  *      not the kernel_pmap.
1590  */
1591 void
1592 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1593 {
1594         vpte_t *ptbase;
1595         vm_offset_t pdnxt, ptpaddr;
1596         vm_pindex_t sindex, eindex;
1597         vm_pindex_t sbase;
1598         pmap_inval_info info;
1599
1600         if (pmap == NULL)
1601                 return;
1602
1603         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1604                 pmap_remove(pmap, sva, eva);
1605                 return;
1606         }
1607
1608         if (prot & VM_PROT_WRITE)
1609                 return;
1610
1611         pmap_inval_init(&info);
1612
1613         ptbase = get_ptbase(pmap, sva);
1614
1615         sindex = (sva >> PAGE_SHIFT);
1616         eindex = (eva >> PAGE_SHIFT);
1617         sbase = sindex;
1618
1619         for (; sindex < eindex; sindex = pdnxt) {
1620
1621                 unsigned pdirindex;
1622
1623                 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1624
1625                 pdirindex = sindex / NPDEPG;
1626                 if (((ptpaddr = pmap->pm_pdir[pdirindex]) & VPTE_PS) != 0) {
1627                         pmap->pm_pdir[pdirindex] &= ~(VPTE_M|VPTE_W);
1628                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1629                         inval_ptbase_pagedir(pmap, pdirindex);
1630                         continue;
1631                 }
1632
1633                 /*
1634                  * Weed out invalid mappings. Note: we assume that the page
1635                  * directory table is always allocated, and in kernel virtual.
1636                  */
1637                 if (ptpaddr == 0)
1638                         continue;
1639
1640                 if (pdnxt > eindex) {
1641                         pdnxt = eindex;
1642                 }
1643
1644                 for (; sindex != pdnxt; sindex++) {
1645                         vpte_t pbits;
1646                         vm_page_t m;
1647
1648                         pbits = ptbase[sindex - sbase];
1649
1650                         if (pbits & VPTE_MANAGED) {
1651                                 m = NULL;
1652                                 if (pbits & VPTE_A) {
1653                                         m = PHYS_TO_VM_PAGE(pbits);
1654                                         vm_page_flag_set(m, PG_REFERENCED);
1655                                         pbits &= ~VPTE_A;
1656                                 }
1657                                 if (pbits & VPTE_M) {
1658                                         if (pmap_track_modified(pmap, i386_ptob(sindex))) {
1659                                                 if (m == NULL)
1660                                                         m = PHYS_TO_VM_PAGE(pbits);
1661                                                 vm_page_dirty(m);
1662                                                 pbits &= ~VPTE_M;
1663                                         }
1664                                 }
1665                         }
1666
1667                         pbits &= ~VPTE_W;
1668
1669                         if (pbits != ptbase[sindex - sbase]) {
1670                                 ptbase[sindex - sbase] = pbits;
1671                                 /* See NOTE: PMAP_INVAL_ADD */
1672                                 pmap_inval_add(&info, pmap, i386_ptob(sindex));
1673                         }
1674                 }
1675         }
1676         pmap_inval_flush(&info);
1677 }
1678
1679 /*
1680  * Enter a managed page into a pmap.  If the page is not wired related pmap
1681  * data can be destroyed at any time for later demand-operation.
1682  *
1683  * Insert the vm_page (m) at virtual address (v) in (pmap), with the
1684  * specified protection, and wire the mapping if requested.
1685  *
1686  * NOTE: This routine may not lazy-evaluate or lose information.  The
1687  * page must actually be inserted into the given map NOW.
1688  *
1689  * NOTE: When entering a page at a KVA address, the pmap must be the
1690  * kernel_pmap.
1691  */
1692 void
1693 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1694            boolean_t wired)
1695 {
1696         vm_paddr_t pa;
1697         vpte_t *pte;
1698         vm_paddr_t opa;
1699         vm_offset_t origpte, newpte;
1700         vm_page_t mpte;
1701         pmap_inval_info info;
1702
1703         if (pmap == NULL)
1704                 return;
1705
1706         va &= VPTE_FRAME;
1707
1708         /*
1709          * Get the page table page.   The kernel_pmap's page table pages
1710          * are preallocated and have no associated vm_page_t.
1711          */
1712         if (pmap == &kernel_pmap)
1713                 mpte = NULL;
1714         else
1715                 mpte = pmap_allocpte(pmap, va);
1716
1717         pmap_inval_init(&info);
1718         pte = pmap_pte(pmap, va);
1719
1720         /*
1721          * Page Directory table entry not valid, we need a new PT page
1722          * and pmap_allocpte() didn't give us one.  Oops!
1723          */
1724         if (pte == NULL) {
1725                 panic("pmap_enter: invalid page directory pmap=%p, va=0x%p\n",
1726                       pmap, (void *)va);
1727         }
1728
1729         pa = VM_PAGE_TO_PHYS(m) & VPTE_FRAME;
1730         origpte = *pte;
1731         opa = origpte & VPTE_FRAME;
1732 #if 0
1733         printf("pmap_enter: pmap %p va %08x pa %08x PDE %08x origpte %08x\n", pmap, va, (int)pa, pmap->pm_pdir[va >> SEG_SHIFT], origpte);
1734 #endif
1735
1736         if (origpte & VPTE_PS)
1737                 panic("pmap_enter: attempted pmap_enter on 4MB page");
1738
1739         /*
1740          * Mapping has not changed, must be protection or wiring change.
1741          */
1742         if (origpte && (opa == pa)) {
1743                 /*
1744                  * Wiring change, just update stats. We don't worry about
1745                  * wiring PT pages as they remain resident as long as there
1746                  * are valid mappings in them. Hence, if a user page is wired,
1747                  * the PT page will be also.
1748                  */
1749                 if (wired && ((origpte & VPTE_WIRED) == 0))
1750                         ++pmap->pm_stats.wired_count;
1751                 else if (!wired && (origpte & VPTE_WIRED))
1752                         --pmap->pm_stats.wired_count;
1753                 KKASSERT(pmap->pm_stats.wired_count >= 0);
1754
1755 #if defined(PMAP_DIAGNOSTIC)
1756                 if (pmap_nw_modified((pt_entry_t) origpte)) {
1757                         kprintf(
1758         "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
1759                             va, origpte);
1760                 }
1761 #endif
1762
1763                 /*
1764                  * Remove the extra pte reference.  Note that we cannot
1765                  * optimize the RO->RW case because we have adjusted the
1766                  * wiring count above and may need to adjust the wiring
1767                  * bits below.
1768                  */
1769                 if (mpte)
1770                         mpte->hold_count--;
1771
1772                 /*
1773                  * We might be turning off write access to the page,
1774                  * so we go ahead and sense modify status.
1775                  */
1776                 if (origpte & VPTE_MANAGED) {
1777                         if ((origpte & VPTE_M) && pmap_track_modified(pmap, va)) {
1778                                 vm_page_t om;
1779                                 om = PHYS_TO_VM_PAGE(opa);
1780                                 vm_page_dirty(om);
1781                         }
1782                         pa |= VPTE_MANAGED;
1783                 }
1784                 goto validate;
1785         }
1786         /*
1787          * Mapping has changed, invalidate old range and fall through to
1788          * handle validating new mapping.
1789          */
1790         if (opa) {
1791                 int err;
1792                 err = pmap_remove_pte(pmap, pte, va, &info);
1793                 if (err)
1794                         panic("pmap_enter: pte vanished, va: 0x%x", va);
1795         }
1796
1797         /*
1798          * Enter on the PV list if part of our managed memory. Note that we
1799          * raise IPL while manipulating pv_table since pmap_enter can be
1800          * called at interrupt time.
1801          */
1802         if (pmap_initialized &&
1803             (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1804                 pmap_insert_entry(pmap, va, mpte, m);
1805                 pa |= VPTE_MANAGED;
1806         }
1807
1808         /*
1809          * Increment counters
1810          */
1811         pmap->pm_stats.resident_count++;
1812         if (wired)
1813                 pmap->pm_stats.wired_count++;
1814
1815 validate:
1816         /*
1817          * Now validate mapping with desired protection/wiring.
1818          */
1819         newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | VPTE_V);
1820
1821         if (wired)
1822                 newpte |= VPTE_WIRED;
1823         newpte |= VPTE_U;
1824
1825         /*
1826          * if the mapping or permission bits are different, we need
1827          * to update the pte.
1828          */
1829         if ((origpte & ~(VPTE_M|VPTE_A)) != newpte) {
1830                 *pte = newpte | VPTE_A;
1831                 /* See NOTE: PMAP_INVAL_ADD */
1832                 pmap_inval_add(&info, pmap, va); /* XXX non-optimal */
1833         }
1834         pmap_inval_flush(&info);
1835 }
1836
1837 /*
1838  * This is a quick version of pmap_enter().  It is used only under the
1839  * following conditions:
1840  *
1841  * (1) The pmap is not the kernel_pmap
1842  * (2) The page is not to be wired into the map
1843  * (3) The page is to mapped read-only in the pmap (initially that is)
1844  * (4) The calling procedure is responsible for flushing the TLB
1845  * (5) The page is always managed
1846  * (6) There is no prior mapping at the VA
1847  */
1848
1849 static vm_page_t
1850 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte)
1851 {
1852         vpte_t *pte;
1853         vm_paddr_t pa;
1854         pmap_inval_info info;
1855         unsigned ptepindex;
1856         vm_offset_t ptepa;
1857
1858         KKASSERT(pmap != &kernel_pmap);
1859         pmap_inval_init(&info);
1860
1861         KKASSERT(va >= VM_MIN_USER_ADDRESS && va < VM_MAX_USER_ADDRESS);
1862
1863         /*
1864          * Instantiate the page table page if required
1865          */
1866
1867         /*
1868          * Calculate pagetable page index
1869          */
1870         ptepindex = va >> PDRSHIFT;
1871         if (mpte && (mpte->pindex == ptepindex)) {
1872                 mpte->hold_count++;
1873         } else {
1874 retry:
1875                 /*
1876                  * Get the page directory entry
1877                  */
1878                 ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1879
1880                 /*
1881                  * If the page table page is mapped, we just increment
1882                  * the hold count, and activate it.
1883                  */
1884                 if (ptepa) {
1885                         if (ptepa & VPTE_PS)
1886                                 panic("pmap_enter_quick: unexpected mapping into 4MB page");
1887                         if (pmap->pm_ptphint &&
1888                                 (pmap->pm_ptphint->pindex == ptepindex)) {
1889                                 mpte = pmap->pm_ptphint;
1890                         } else {
1891                                 mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1892                                 pmap->pm_ptphint = mpte;
1893                         }
1894                         if (mpte == NULL)
1895                                 goto retry;
1896                         mpte->hold_count++;
1897                 } else {
1898                         mpte = _pmap_allocpte(pmap, ptepindex);
1899                 }
1900         }
1901
1902         /*
1903          * Ok, now that the page table page has been validated, get the pte.
1904          * If the pte is already mapped undo mpte's hold_count and
1905          * just return.
1906          */
1907         pte = pmap_pte(pmap, va);
1908         if (*pte) {
1909                 if (mpte)
1910                         pmap_unwire_pte_hold(pmap, mpte, &info);
1911                 return 0;
1912         }
1913
1914         /*
1915          * Enter on the PV list if part of our managed memory. Note that we
1916          * raise IPL while manipulating pv_table since pmap_enter can be
1917          * called at interrupt time.
1918          */
1919         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
1920                 pmap_insert_entry(pmap, va, mpte, m);
1921
1922         /*
1923          * Increment counters
1924          */
1925         pmap->pm_stats.resident_count++;
1926
1927         pa = VM_PAGE_TO_PHYS(m);
1928
1929         /*
1930          * Now validate mapping with RO protection
1931          */
1932         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
1933                 *pte = pa | VPTE_V | VPTE_U;
1934         else
1935                 *pte = pa | VPTE_V | VPTE_U | VPTE_MANAGED;
1936
1937         return mpte;
1938 }
1939
1940 /*
1941  * Extract the physical address for the translation at the specified
1942  * virtual address in the pmap.
1943  */
1944 vm_paddr_t
1945 pmap_extract(pmap_t pmap, vm_offset_t va)
1946 {
1947         vm_paddr_t rtval;
1948         vpte_t pte;
1949
1950         if (pmap && (pte = pmap->pm_pdir[va >> SEG_SHIFT]) != 0) {
1951                 if (pte & VPTE_PS) {
1952                         rtval = pte & ~((vpte_t)(1 << SEG_SHIFT) - 1);
1953                         rtval |= va & SEG_MASK;
1954                 } else {
1955                         pte = *get_ptbase(pmap, va);
1956                         rtval = (pte & VPTE_FRAME) | (va & PAGE_MASK);
1957                 }
1958                 return(rtval);
1959         }
1960         return(0);
1961 }
1962
1963 #define MAX_INIT_PT (96)
1964
1965 /*
1966  * This routine preloads the ptes for a given object into the specified pmap.
1967  * This eliminates the blast of soft faults on process startup and
1968  * immediately after an mmap.
1969  */
1970 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
1971
1972 void
1973 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
1974                     vm_object_t object, vm_pindex_t pindex,
1975                     vm_size_t size, int limit)
1976 {
1977         struct rb_vm_page_scan_info info;
1978         int psize;
1979
1980         /*
1981          * We can't preinit if read access isn't set or there is no pmap
1982          * or object.
1983          */
1984         if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
1985                 return;
1986
1987         /*
1988          * We can't preinit if the pmap is not the current pmap
1989          */
1990         if (curproc == NULL || pmap != vmspace_pmap(curproc->p_vmspace))
1991                 return;
1992
1993         psize = size >> PAGE_SHIFT;
1994
1995         if ((object->type != OBJT_VNODE) ||
1996                 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
1997                         (object->resident_page_count > MAX_INIT_PT))) {
1998                 return;
1999         }
2000
2001         if (psize + pindex > object->size) {
2002                 if (object->size < pindex)
2003                         return;
2004                 psize = object->size - pindex;
2005         }
2006
2007         if (psize == 0)
2008                 return;
2009
2010         /*
2011          * Use a red-black scan to traverse the requested range and load
2012          * any valid pages found into the pmap.
2013          *
2014          * We cannot safely scan the object's memq unless we are in a
2015          * critical section since interrupts can remove pages from objects.
2016          */
2017         info.start_pindex = pindex;
2018         info.end_pindex = pindex + psize - 1;
2019         info.limit = limit;
2020         info.mpte = NULL;
2021         info.addr = addr;
2022         info.pmap = pmap;
2023
2024         crit_enter();
2025         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2026                                 pmap_object_init_pt_callback, &info);
2027         crit_exit();
2028 }
2029
2030 static
2031 int
2032 pmap_object_init_pt_callback(vm_page_t p, void *data)
2033 {
2034         struct rb_vm_page_scan_info *info = data;
2035         vm_pindex_t rel_index;
2036         /*
2037          * don't allow an madvise to blow away our really
2038          * free pages allocating pv entries.
2039          */
2040         if ((info->limit & MAP_PREFAULT_MADVISE) &&
2041                 vmstats.v_free_count < vmstats.v_free_reserved) {
2042                     return(-1);
2043         }
2044         if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2045             (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2046                 if ((p->queue - p->pc) == PQ_CACHE)
2047                         vm_page_deactivate(p);
2048                 vm_page_busy(p);
2049                 rel_index = p->pindex - info->start_pindex;
2050                 info->mpte = pmap_enter_quick(info->pmap,
2051                                               info->addr + i386_ptob(rel_index),
2052                                               p, info->mpte);
2053                 vm_page_flag_set(p, PG_MAPPED);
2054                 vm_page_wakeup(p);
2055         }
2056         return(0);
2057 }
2058
2059 /*
2060  * pmap_prefault provides a quick way of clustering pagefaults into a
2061  * processes address space.  It is a "cousin" of pmap_object_init_pt,
2062  * except it runs at page fault time instead of mmap time.
2063  */
2064 #define PFBAK 4
2065 #define PFFOR 4
2066 #define PAGEORDER_SIZE (PFBAK+PFFOR)
2067
2068 static int pmap_prefault_pageorder[] = {
2069         -PAGE_SIZE, PAGE_SIZE,
2070         -2 * PAGE_SIZE, 2 * PAGE_SIZE,
2071         -3 * PAGE_SIZE, 3 * PAGE_SIZE,
2072         -4 * PAGE_SIZE, 4 * PAGE_SIZE
2073 };
2074
2075 void
2076 pmap_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
2077 {
2078         int i;
2079         vm_offset_t starta;
2080         vm_offset_t addr;
2081         vm_pindex_t pindex;
2082         vm_page_t m, mpte;
2083         vm_object_t object;
2084
2085         /*
2086          * We do not currently prefault mappings that use virtual page
2087          * tables.  We do not prefault foreign pmaps.
2088          */
2089         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
2090                 return;
2091         if (curproc == NULL || (pmap != vmspace_pmap(curproc->p_vmspace)))
2092                 return;
2093
2094         object = entry->object.vm_object;
2095
2096         starta = addra - PFBAK * PAGE_SIZE;
2097         if (starta < entry->start)
2098                 starta = entry->start;
2099         else if (starta > addra)
2100                 starta = 0;
2101
2102         /*
2103          * critical section protection is required to maintain the
2104          * page/object association, interrupts can free pages and remove
2105          * them from their objects.
2106          */
2107         mpte = NULL;
2108         crit_enter();
2109         for (i = 0; i < PAGEORDER_SIZE; i++) {
2110                 vm_object_t lobject;
2111                 vpte_t *pte;
2112
2113                 addr = addra + pmap_prefault_pageorder[i];
2114                 if (addr > addra + (PFFOR * PAGE_SIZE))
2115                         addr = 0;
2116
2117                 if (addr < starta || addr >= entry->end)
2118                         continue;
2119
2120                 /*
2121                  * Make sure the page table page already exists
2122                  */
2123                 if ((*pmap_pde(pmap, addr)) == NULL)
2124                         continue;
2125
2126                 /*
2127                  * Get a pointer to the pte and make sure that no valid page
2128                  * has been mapped.
2129                  */
2130                 pte = get_ptbase(pmap, addr);
2131                 if (*pte)
2132                         continue;
2133
2134                 /*
2135                  * Get the page to be mapped
2136                  */
2137                 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2138                 lobject = object;
2139
2140                 for (m = vm_page_lookup(lobject, pindex);
2141                     (!m && (lobject->type == OBJT_DEFAULT) &&
2142                      (lobject->backing_object));
2143                     lobject = lobject->backing_object
2144                 ) {
2145                         if (lobject->backing_object_offset & PAGE_MASK)
2146                                 break;
2147                         pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2148                         m = vm_page_lookup(lobject->backing_object, pindex);
2149                 }
2150
2151                 /*
2152                  * give-up when a page is not in memory
2153                  */
2154                 if (m == NULL)
2155                         break;
2156
2157                 /*
2158                  * If everything meets the requirements for pmap_enter_quick(),
2159                  * then enter the page.
2160                  */
2161
2162                 if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2163                         (m->busy == 0) &&
2164                     (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2165
2166                         if ((m->queue - m->pc) == PQ_CACHE) {
2167                                 vm_page_deactivate(m);
2168                         }
2169                         vm_page_busy(m);
2170                         mpte = pmap_enter_quick(pmap, addr, m, mpte);
2171                         vm_page_flag_set(m, PG_MAPPED);
2172                         vm_page_wakeup(m);
2173                 }
2174         }
2175         crit_exit();
2176 }
2177
2178 /*
2179  *      Routine:        pmap_change_wiring
2180  *      Function:       Change the wiring attribute for a map/virtual-address
2181  *                      pair.
2182  *      In/out conditions:
2183  *                      The mapping must already exist in the pmap.
2184  */
2185 void
2186 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2187 {
2188         vpte_t *pte;
2189
2190         if (pmap == NULL)
2191                 return;
2192
2193         pte = get_ptbase(pmap, va);
2194
2195         if (wired && (*pte & VPTE_WIRED) == 0)
2196                 ++pmap->pm_stats.wired_count;
2197         else if (!wired && (*pte & VPTE_WIRED))
2198                 --pmap->pm_stats.wired_count;
2199         KKASSERT(pmap->pm_stats.wired_count >= 0);
2200
2201         /*
2202          * Wiring is not a hardware characteristic so there is no need to
2203          * invalidate TLB.  However, in an SMP environment we must use
2204          * a locked bus cycle to update the pte (if we are not using
2205          * the pmap_inval_*() API that is)... it's ok to do this for simple
2206          * wiring changes.
2207          */
2208         if (wired)
2209                 atomic_set_int(pte, VPTE_WIRED);
2210         else
2211                 atomic_clear_int(pte, VPTE_WIRED);
2212 }
2213
/*
 * pmap_copy:
 *
 *	Copy the mappings for the range src_addr/len in the source pmap
 *	to the range dst_addr/len in the destination pmap.
 *
 *	This routine is only advisory and need not do anything.  It
 *	returns without doing any work unless dst_addr equals src_addr
 *	and both pmaps have a page directory, and it stops early when
 *	free pages run low or the pv entry count is above its high
 *	water mark.
 *
 *	Only managed ptes are copied; the modified and accessed bits are
 *	cleared in the copy.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	vm_size_t len, vm_offset_t src_addr)
{
	pmap_inval_info info;
	vm_offset_t addr;
	vm_offset_t end_addr = src_addr + len;
	vm_offset_t pdnxt;
	vpte_t *src_frame;
	vpte_t *dst_frame;
	vm_page_t m;

	/* Only identity-address copies are supported. */
	if (dst_addr != src_addr)
		return;
	if (dst_pmap->pm_pdir == NULL)
		return;
	if (src_pmap->pm_pdir == NULL)
		return;

	/* pte pointers for src_addr in each pmap; indexed per page below */
	src_frame = get_ptbase1(src_pmap, src_addr);
	dst_frame = get_ptbase2(dst_pmap, src_addr);

	pmap_inval_init(&info);
#if 0
	/* XXX */
	pmap_inval_add(&info, dst_pmap, -1);
	pmap_inval_add(&info, src_pmap, -1);
#endif

	/*
	 * critical section protection is required to maintain the page/object
	 * association, interrupts can free pages and remove them from
	 * their objects.
	 */
	crit_enter();
	/* Outer loop: one iteration per page table page worth of VA. */
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		vpte_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		vm_offset_t srcptepaddr;
		unsigned ptepindex;

		if (addr >= VM_MAX_USER_ADDRESS)
			panic("pmap_copy: invalid to pmap_copy page tables\n");

		/*
		 * Don't let optional prefaulting of pages make us go
		 * way below the low water mark of free pages or way
		 * above high water mark of used pv entries.
		 */
		if (vmstats.v_free_count < vmstats.v_free_reserved ||
		    pv_entry_count > pv_entry_high_water)
			break;

		/* Start of the next page-table-page-aligned region. */
		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
		ptepindex = addr >> PDRSHIFT;

		/* Nothing mapped by the source in this region. */
		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
		if (srcptepaddr == 0)
			continue;

		/*
		 * Large-page directory entry: share it directly if the
		 * destination slot is empty.
		 */
		if (srcptepaddr & VPTE_PS) {
			if (dst_pmap->pm_pdir[ptepindex] == 0) {
				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
			}
			continue;
		}

		/*
		 * Skip the region if the source page table page cannot be
		 * found, holds no mappings, or is busy.
		 */
		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
		if ((srcmpte == NULL) ||
			(srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY))
			continue;

		/* Do not run past the end of the requested range. */
		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = src_frame + ((addr - src_addr) >> PAGE_SHIFT);
		dst_pte = dst_frame + ((addr - src_addr) >> PAGE_SHIFT);
		/* Inner loop: one iteration per page within the region. */
		while (addr < pdnxt) {
			vpte_t ptetemp;
			ptetemp = *src_pte;
			/*
			 * we only virtual copy managed pages
			 */
			if ((ptetemp & VPTE_MANAGED) != 0) {
				/*
				 * We have to check after allocpte for the
				 * pte still being around...  allocpte can
				 * block.
				 */
				dstmpte = pmap_allocpte(dst_pmap, addr);
				if ((*dst_pte == 0) && (ptetemp = *src_pte)) {
					/*
					 * Clear the modified and accessed
					 * (referenced) bits during the copy.
					 *
					 * We do not have to clear the write
					 * bit to force a fault-on-modify
					 * because the real kernel's target
					 * pmap is empty and will fault anyway.
					 */
					m = PHYS_TO_VM_PAGE(ptetemp);
					*dst_pte = ptetemp & ~(VPTE_M | VPTE_A);
					dst_pmap->pm_stats.resident_count++;
					pmap_insert_entry(dst_pmap, addr,
						dstmpte, m);
				} else {
					/*
					 * Destination already mapped or the
					 * source pte went away: drop the hold
					 * obtained by pmap_allocpte().
					 */
					pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
				}
				/*
				 * Stop scanning this region once the
				 * destination page table page holds at least
				 * as many references as the source one.
				 *
				 * NOTE(review): dstmpte is dereferenced here
				 * after the unwire call above; confirm that
				 * pmap_unwire_pte_hold() cannot free it.
				 */
				if (dstmpte->hold_count >= srcmpte->hold_count)
					break;
			}
			addr += PAGE_SIZE;
			src_pte++;
			dst_pte++;
		}
	}
	crit_exit();
	pmap_inval_flush(&info);
}
2341
2342 /*
2343  * pmap_zero_page:
2344  *
2345  *      Zero the specified PA by mapping the page into KVM and clearing its
2346  *      contents.
2347  *
2348  *      This function may be called from an interrupt and no locking is
2349  *      required.
2350  */
2351 void
2352 pmap_zero_page(vm_paddr_t phys)
2353 {
2354         struct mdglobaldata *gd = mdcpu;
2355
2356         crit_enter();
2357         if (*gd->gd_CMAP3)
2358                 panic("pmap_zero_page: CMAP3 busy");
2359         *gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W | (phys & VPTE_FRAME) | VPTE_A | VPTE_M;
2360         madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);
2361
2362         bzero(gd->gd_CADDR3, PAGE_SIZE);
2363         *gd->gd_CMAP3 = 0;
2364         crit_exit();
2365 }
2366
2367 /*
2368  * pmap_page_assertzero:
2369  *
2370  *      Assert that a page is empty, panic if it isn't.
2371  */
2372 void
2373 pmap_page_assertzero(vm_paddr_t phys)
2374 {
2375         struct mdglobaldata *gd = mdcpu;
2376         int i;
2377
2378         crit_enter();
2379         if (*gd->gd_CMAP3)
2380                 panic("pmap_zero_page: CMAP3 busy");
2381         *gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W |
2382                         (phys & VPTE_FRAME) | VPTE_A | VPTE_M;
2383         madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);
2384         for (i = 0; i < PAGE_SIZE; i += 4) {
2385             if (*(int *)((char *)gd->gd_CADDR3 + i) != 0) {
2386                 panic("pmap_page_assertzero() @ %p not zero!\n",
2387                     (void *)gd->gd_CADDR3);
2388             }
2389         }
2390         *gd->gd_CMAP3 = 0;
2391         crit_exit();
2392 }
2393
2394 /*
2395  * pmap_zero_page:
2396  *
2397  *      Zero part of a physical page by mapping it into memory and clearing
2398  *      its contents with bzero.
2399  *
2400  *      off and size may not cover an area beyond a single hardware page.
2401  */
2402 void
2403 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
2404 {
2405         struct mdglobaldata *gd = mdcpu;
2406
2407         crit_enter();
2408         if (*gd->gd_CMAP3)
2409                 panic("pmap_zero_page: CMAP3 busy");
2410         *gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W |
2411                         (phys & VPTE_FRAME) | VPTE_A | VPTE_M;
2412         madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);
2413
2414         bzero((char *)gd->gd_CADDR3 + off, size);
2415         *gd->gd_CMAP3 = 0;
2416         crit_exit();
2417 }
2418
2419 /*
2420  * pmap_copy_page:
2421  *
2422  *      Copy the physical page from the source PA to the target PA.
2423  *      This function may be called from an interrupt.  No locking
2424  *      is required.
2425  */
2426 void
2427 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
2428 {
2429         struct mdglobaldata *gd = mdcpu;
2430
2431         crit_enter();
2432         if (*(int *) gd->gd_CMAP1)
2433                 panic("pmap_copy_page: CMAP1 busy");
2434         if (*(int *) gd->gd_CMAP2)
2435                 panic("pmap_copy_page: CMAP2 busy");
2436
2437         *(int *) gd->gd_CMAP1 = VPTE_V | VPTE_R | (src & PG_FRAME) | VPTE_A;
2438         *(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W | (dst & VPTE_FRAME) | VPTE_A | VPTE_M;
2439
2440         madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
2441         madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);
2442
2443         bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);
2444
2445         *(int *) gd->gd_CMAP1 = 0;
2446         *(int *) gd->gd_CMAP2 = 0;
2447         crit_exit();
2448 }
2449
2450 /*
2451  * pmap_copy_page_frag:
2452  *
2453  *      Copy the physical page from the source PA to the target PA.
2454  *      This function may be called from an interrupt.  No locking
2455  *      is required.
2456  */
2457 void
2458 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
2459 {
2460         struct mdglobaldata *gd = mdcpu;
2461
2462         crit_enter();
2463         if (*(int *) gd->gd_CMAP1)
2464                 panic("pmap_copy_page: CMAP1 busy");
2465         if (*(int *) gd->gd_CMAP2)
2466                 panic("pmap_copy_page: CMAP2 busy");
2467
2468         *(int *) gd->gd_CMAP1 = VPTE_V | (src & VPTE_FRAME) | VPTE_A;
2469         *(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W | (dst & VPTE_FRAME) | VPTE_A | VPTE_M;
2470
2471         madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
2472         madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);
2473
2474         bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
2475               (char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
2476               bytes);
2477
2478         *(int *) gd->gd_CMAP1 = 0;
2479         *(int *) gd->gd_CMAP2 = 0;
2480         crit_exit();
2481 }
2482
2483 /*
2484  * Returns true if the pmap's pv is one of the first
2485  * 16 pvs linked to from this page.  This count may
2486  * be changed upwards or downwards in the future; it
2487  * is only necessary that true be returned for a small
2488  * subset of pmaps for proper page aging.
2489  */
2490 boolean_t
2491 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2492 {
2493         pv_entry_t pv;
2494         int loops = 0;
2495
2496         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2497                 return FALSE;
2498
2499         crit_enter();
2500
2501         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2502                 if (pv->pv_pmap == pmap) {
2503                         crit_exit();
2504                         return TRUE;
2505                 }
2506                 loops++;
2507                 if (loops >= 16)
2508                         break;
2509         }
2510         crit_exit();
2511         return (FALSE);
2512 }
2513
2514 /*
2515  * Remove all pages from specified address space
2516  * this aids process exit speeds.  Also, this code
2517  * is special cased for current process only, but
2518  * can have the more generic (and slightly slower)
2519  * mode enabled.  This is much faster than pmap_remove
2520  * in the case of running down an entire address space.
2521  */
2522 void
2523 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2524 {
2525         vpte_t *pte, tpte;
2526         pv_entry_t pv, npv;
2527         vm_page_t m;
2528         pmap_inval_info info;
2529         int iscurrentpmap;
2530         int32_t save_generation;
2531
2532         if (curproc && pmap == vmspace_pmap(curproc->p_vmspace))
2533                 iscurrentpmap = 1;
2534         else
2535                 iscurrentpmap = 0;
2536
2537         pmap_inval_init(&info);
2538         crit_enter();
2539         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2540                 if (pv->pv_va >= eva || pv->pv_va < sva) {
2541                         npv = TAILQ_NEXT(pv, pv_plist);
2542                         continue;
2543                 }
2544
2545                 KKASSERT(pmap == pv->pv_pmap);
2546
2547                 pte = pmap_pte(pmap, pv->pv_va);
2548                 tpte = *pte;
2549
2550                 /*
2551                  * We cannot remove wired pages from a process' mapping
2552                  * at this time
2553                  */
2554                 if (tpte & VPTE_WIRED) {
2555                         npv = TAILQ_NEXT(pv, pv_plist);
2556                         continue;
2557                 }
2558                 *pte = 0;
2559                 /* See NOTE: PMAP_INVAL_ADD */
2560                 pmap_inval_add(&info, pmap, pv->pv_va);
2561
2562                 m = PHYS_TO_VM_PAGE(tpte);
2563
2564                 KASSERT(m < &vm_page_array[vm_page_array_size],
2565                         ("pmap_remove_pages: bad tpte %x", tpte));
2566
2567                 pmap->pm_stats.resident_count--;
2568
2569                 /*
2570                  * Update the vm_page_t clean and reference bits.
2571                  */
2572                 if (tpte & VPTE_M) {
2573                         vm_page_dirty(m);
2574                 }
2575
2576                 npv = TAILQ_NEXT(pv, pv_plist);
2577                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2578                 save_generation = ++pmap->pm_generation;
2579
2580                 m->md.pv_list_count--;
2581                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2582                 if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2583                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
2584                 }
2585
2586                 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info);
2587                 free_pv_entry(pv);
2588
2589                 /*
2590                  * Restart the scan if we blocked during the unuse or free
2591                  * calls and other removals were made.
2592                  */
2593                 if (save_generation != pmap->pm_generation) {
2594                         kprintf("Warning: pmap_remove_pages race-A avoided\n");
2595                         pv = TAILQ_FIRST(&pmap->pm_pvlist);
2596                 }
2597         }
2598         pmap_inval_flush(&info);
2599         crit_exit();
2600 }
2601
2602 /*
2603  * pmap_testbit tests bits in pte's
2604  * note that the testbit/changebit routines are inline,
2605  * and a lot of things compile-time evaluate.
2606  */
2607 static boolean_t
2608 pmap_testbit(vm_page_t m, int bit)
2609 {
2610         pv_entry_t pv;
2611         vpte_t *pte;
2612
2613         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2614                 return FALSE;
2615
2616         if (TAILQ_FIRST(&m->md.pv_list) == NULL)
2617                 return FALSE;
2618
2619         crit_enter();
2620
2621         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2622                 /*
2623                  * if the bit being tested is the modified bit, then
2624                  * mark clean_map and ptes as never
2625                  * modified.
2626                  */
2627                 if (bit & (VPTE_A|VPTE_M)) {
2628                         if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
2629                                 continue;
2630                 }
2631
2632 #if defined(PMAP_DIAGNOSTIC)
2633                 if (!pv->pv_pmap) {
2634                         kprintf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2635                         continue;
2636                 }
2637 #endif
2638                 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2639                 if (*pte & bit) {
2640                         crit_exit();
2641                         return TRUE;
2642                 }
2643         }
2644         crit_exit();
2645         return (FALSE);
2646 }
2647
2648 /*
2649  * This routine is used to clear bits in ptes.  Certain bits require special
2650  * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
2651  */
2652 static __inline void
2653 pmap_clearbit(vm_page_t m, int bit)
2654 {
2655         struct pmap_inval_info info;
2656         pv_entry_t pv;
2657         vpte_t *pte;
2658         vpte_t pbits;
2659
2660         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2661                 return;
2662
2663         pmap_inval_init(&info);
2664         crit_enter();
2665
2666         /*
2667          * Loop over all current mappings setting/clearing as appropos If
2668          * setting RO do we need to clear the VAC?
2669          */
2670         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2671                 /*
2672                  * don't write protect pager mappings
2673                  */
2674                 if (bit == VPTE_W) {
2675                         if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
2676                                 continue;
2677                 }
2678
2679 #if defined(PMAP_DIAGNOSTIC)
2680                 if (!pv->pv_pmap) {
2681                         kprintf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2682                         continue;
2683                 }
2684 #endif
2685
2686                 /*
2687                  * Careful here.  We can use a locked bus instruction to
2688                  * clear VPTE_A or VPTE_M safely but we need to synchronize
2689                  * with the target cpus when we mess with VPTE_W.
2690                  *
2691                  * On virtual kernels we must force a new fault-on-write
2692                  * in the real kernel if we clear the Modify bit ourselves,
2693                  * otherwise the real kernel will not get a new fault and
2694                  * will never set our Modify bit again.
2695                  */
2696                 pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2697                 if (bit & (VPTE_W|VPTE_M))
2698                         pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
2699
2700                 pbits = *pte;
2701                 if (pbits & bit) {
2702                         if (bit == VPTE_W) {
2703                                 if (pbits & VPTE_M) {
2704                                         vm_page_dirty(m);
2705                                 }
2706                                 atomic_clear_int(pte, VPTE_M|VPTE_W);
2707                         } else if (bit == VPTE_M) {
2708                                 /*
2709                                  * We do not have to make the page read-only
2710                                  * when clearing the Modify bit.  The real
2711                                  * kernel will make the real PTE read-only
2712                                  * or otherwise detect the write and set
2713                                  * our VPTE_M again simply by us invalidating
2714                                  * the real kernel VA for the pmap (as we did
2715                                  * above).  This allows the real kernel to
2716                                  * handle the write fault without forwarding
2717                                  * the fault to us.
2718                                  */
2719                                 atomic_clear_int(pte, VPTE_M);
2720                         } else {
2721                                 atomic_clear_int(pte, bit);
2722                         }
2723                 }
2724         }
2725         pmap_inval_flush(&info);
2726         crit_exit();
2727 }
2728
2729 /*
2730  *      pmap_page_protect:
2731  *
2732  *      Lower the permission for all mappings to a given page.
2733  */
2734 void
2735 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2736 {
2737         if ((prot & VM_PROT_WRITE) == 0) {
2738                 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2739                         pmap_clearbit(m, VPTE_W);
2740                 } else {
2741                         pmap_remove_all(m);
2742                 }
2743         }
2744 }
2745
2746 vm_paddr_t
2747 pmap_phys_address(int ppn)
2748 {
2749         return (i386_ptob(ppn));
2750 }
2751
2752 /*
2753  *      pmap_ts_referenced:
2754  *
2755  *      Return a count of reference bits for a page, clearing those bits.
2756  *      It is not necessary for every reference bit to be cleared, but it
2757  *      is necessary that 0 only be returned when there are truly no
2758  *      reference bits set.
2759  *
2760  *      XXX: The exact number of bits to check and clear is a matter that
2761  *      should be tested and standardized at some point in the future for
2762  *      optimal aging of shared pages.
2763  */
2764 int
2765 pmap_ts_referenced(vm_page_t m)
2766 {
2767         pv_entry_t pv, pvf, pvn;
2768         vpte_t *pte;
2769         int rtval = 0;
2770
2771         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2772                 return (rtval);
2773
2774         crit_enter();
2775
2776         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2777
2778                 pvf = pv;
2779
2780                 do {
2781                         pvn = TAILQ_NEXT(pv, pv_list);
2782
2783                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2784
2785                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2786
2787                         if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
2788                                 continue;
2789
2790                         pte = pmap_pte(pv->pv_pmap, pv->pv_va);
2791
2792                         if (pte && (*pte & VPTE_A)) {
2793 #ifdef SMP
2794                                 atomic_clear_int(pte, VPTE_A);
2795 #else
2796                                 atomic_clear_int_nonlocked(pte, VPTE_A);
2797 #endif
2798                                 rtval++;
2799                                 if (rtval > 4) {
2800                                         break;
2801                                 }
2802                         }
2803                 } while ((pv = pvn) != NULL && pv != pvf);
2804         }
2805         crit_exit();
2806
2807         return (rtval);
2808 }
2809
2810 /*
2811  *      pmap_is_modified:
2812  *
2813  *      Return whether or not the specified physical page was modified
2814  *      in any physical maps.
2815  */
2816 boolean_t
2817 pmap_is_modified(vm_page_t m)
2818 {
2819         return pmap_testbit(m, VPTE_M);
2820 }
2821
2822 /*
2823  *      Clear the modify bits on the specified physical page.
2824  */
2825 void
2826 pmap_clear_modify(vm_page_t m)
2827 {
2828         pmap_clearbit(m, VPTE_M);
2829 }
2830
2831 /*
2832  *      pmap_clear_reference:
2833  *
2834  *      Clear the reference bit on the specified physical page.
2835  */
2836 void
2837 pmap_clear_reference(vm_page_t m)
2838 {
2839         pmap_clearbit(m, VPTE_A);
2840 }
2841
2842 /*
2843  * Miscellaneous support routines follow
2844  */
2845
2846 static void
2847 i386_protection_init(void)
2848 {
2849         int *kp, prot;
2850
2851         kp = protection_codes;
2852         for (prot = 0; prot < 8; prot++) {
2853                 if (prot & VM_PROT_READ)
2854                         *kp |= VPTE_R;
2855                 if (prot & VM_PROT_WRITE)
2856                         *kp |= VPTE_W;
2857                 if (prot & VM_PROT_EXECUTE)
2858                         *kp |= VPTE_X;
2859                 ++kp;
2860         }
2861 }
2862
2863 /*
2864  * Map a set of physical memory pages into the kernel virtual
2865  * address space. Return a pointer to where it is mapped. This
2866  * routine is intended to be used for mapping device memory,
2867  * NOT real memory.
2868  *
2869  * NOTE: we can't use pgeflag unless we invalidate the pages one at
2870  * a time.
2871  */
2872 void *
2873 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
2874 {
2875         vm_offset_t va, tmpva, offset;
2876         vpte_t *pte;
2877
2878         offset = pa & PAGE_MASK;
2879         size = roundup(offset + size, PAGE_SIZE);
2880
2881         va = kmem_alloc_nofault(&kernel_map, size);
2882         if (!va)
2883                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2884
2885         pa = pa & VPTE_FRAME;
2886         for (tmpva = va; size > 0;) {
2887                 pte = KernelPTA + (tmpva >> PAGE_SHIFT);
2888                 *pte = pa | VPTE_R | VPTE_W | VPTE_V; /* | pgeflag; */
2889                 size -= PAGE_SIZE;
2890                 tmpva += PAGE_SIZE;
2891                 pa += PAGE_SIZE;
2892         }
2893         cpu_invltlb();
2894         smp_invltlb();
2895
2896         return ((void *)(va + offset));
2897 }
2898
2899 void
2900 pmap_unmapdev(vm_offset_t va, vm_size_t size)
2901 {
2902         vm_offset_t base, offset;
2903
2904         base = va & VPTE_FRAME;
2905         offset = va & PAGE_MASK;
2906         size = roundup(offset + size, PAGE_SIZE);
2907         pmap_qremove(va, size >> PAGE_SHIFT);
2908         kmem_free(&kernel_map, base, size);
2909 }
2910
2911 /*
2912  * perform the pmap work for mincore
2913  */
2914 int
2915 pmap_mincore(pmap_t pmap, vm_offset_t addr)
2916 {
2917         vpte_t *ptep, pte;
2918         vm_page_t m;
2919         int val = 0;
2920
2921         ptep = pmap_pte(pmap, addr);
2922         if (ptep == 0) {
2923                 return 0;
2924         }
2925
2926         if ((pte = *ptep) != 0) {
2927                 vm_offset_t pa;
2928
2929                 val = MINCORE_INCORE;
2930                 if ((pte & VPTE_MANAGED) == 0)
2931                         return val;
2932
2933                 pa = pte & VPTE_FRAME;
2934
2935                 m = PHYS_TO_VM_PAGE(pa);
2936
2937                 /*
2938                  * Modified by us
2939                  */
2940                 if (pte & VPTE_M)
2941                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2942                 /*
2943                  * Modified by someone
2944                  */
2945                 else if (m->dirty || pmap_is_modified(m))
2946                         val |= MINCORE_MODIFIED_OTHER;
2947                 /*
2948                  * Referenced by us
2949                  */
2950                 if (pte & VPTE_A)
2951                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
2952
2953                 /*
2954                  * Referenced by someone
2955                  */
2956                 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
2957                         val |= MINCORE_REFERENCED_OTHER;
2958                         vm_page_flag_set(m, PG_REFERENCED);
2959                 }
2960         }
2961         return val;
2962 }
2963
2964 void
2965 pmap_activate(struct proc *p)
2966 {
2967         pmap_t  pmap;
2968
2969         pmap = vmspace_pmap(p->p_vmspace);
2970 #if defined(SMP)
2971         atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
2972 #else
2973         pmap->pm_active |= 1;
2974 #endif
2975 #if defined(SWTCH_OPTIM_STATS)
2976         tlb_flush_count++;
2977 #endif
2978 #if 0
2979         KKASSERT((p == curproc));
2980
2981         curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pdir);
2982         load_cr3(curthread->td_pcb->pcb_cr3);
2983 #endif
2984 }
2985
2986 void
2987 pmap_deactivate(struct proc *p)
2988 {
2989         pmap_t  pmap;
2990
2991         pmap = vmspace_pmap(p->p_vmspace);
2992 #if defined(SMP)
2993         atomic_clear_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
2994 #else
2995         pmap->pm_active &= ~1;
2996 #endif
2997         /*
2998          * XXX - note we do not adjust %cr3.  The caller is expected to
2999          * activate a new pmap or do a thread-exit.
3000          */
3001 }
3002
3003 vm_offset_t
3004 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3005 {
3006
3007         if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3008                 return addr;
3009         }
3010
3011         addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3012         return addr;
3013 }
3014
3015
3016 #if defined(DEBUG)
3017
3018 static void     pads (pmap_t pm);
3019 void            pmap_pvdump (vm_paddr_t pa);
3020
3021 /* print address space of pmap*/
3022 static void
3023 pads(pmap_t pm)
3024 {
3025         vm_offset_t va;
3026         int i, j;
3027         vpte_t *ptep;
3028
3029         if (pm == &kernel_pmap)
3030                 return;
3031         for (i = 0; i < 1024; i++)
3032                 if (pm->pm_pdir[i])
3033                         for (j = 0; j < 1024; j++) {
3034                                 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3035                                 if (pm == &kernel_pmap && va < KERNBASE)
3036                                         continue;
3037                                 if (pm != &kernel_pmap && va > UPT_MAX_ADDRESS)
3038                                         continue;
3039                                 ptep = pmap_pte(pm, va);
3040                                 if (ptep && (*ptep & VPTE_V)) {
3041                                         kprintf("%p:%x ",
3042                                                 (void *)va, (unsigned)*ptep);
3043                                 }
3044                         };
3045
3046 }
3047
3048 void
3049 pmap_pvdump(vm_paddr_t pa)
3050 {
3051         pv_entry_t pv;
3052         vm_page_t m;
3053
3054         kprintf("pa %08llx", (long long)pa);
3055         m = PHYS_TO_VM_PAGE(pa);
3056         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3057 #ifdef used_to_be
3058                 kprintf(" -> pmap %p, va %x, flags %x",
3059                     (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
3060 #endif
3061                 kprintf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3062                 pads(pv->pv_pmap);
3063         }
3064         kprintf(" ");
3065 }
3066 #endif
3067