kernel - Correct null pointer panic in debug code
[dragonfly.git] / sys / vm / vm_map.c
1 /*
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $
65  */
66
67 /*
68  *      Virtual memory mapping module.
69  */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/proc.h>
75 #include <sys/serialize.h>
76 #include <sys/lock.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/resourcevar.h>
81 #include <sys/shm.h>
82 #include <sys/tree.h>
83 #include <sys/malloc.h>
84 #include <sys/objcache.h>
85 #include <sys/kern_syscall.h>
86
87 #include <vm/vm.h>
88 #include <vm/vm_param.h>
89 #include <vm/pmap.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_page.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_pager.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_extern.h>
96 #include <vm/swap_pager.h>
97 #include <vm/vm_zone.h>
98
99 #include <sys/random.h>
100 #include <sys/sysctl.h>
101 #include <sys/spinlock.h>
102
103 #include <sys/thread2.h>
104 #include <sys/spinlock2.h>
105
106 /*
107  * Virtual memory maps provide for the mapping, protection, and sharing
108  * of virtual memory objects.  In addition, this module provides for an
109  * efficient virtual copy of memory from one map to another.
110  *
111  * Synchronization is required prior to most operations.
112  *
113  * Maps consist of an ordered doubly-linked list of simple entries.
114  * A hint and an RB tree are used to speed up lookups.
115  *
116  * Callers looking to modify maps specify start/end addresses which cause
117  * the related map entry to be clipped if necessary, and then later
118  * recombined if the pieces remain compatible.
119  *
120  * Virtual copy operations are performed by copying VM object references
121  * from one map to another, and then marking both regions as copy-on-write.
122  */
123 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
124 static void vmspace_dtor(void *obj, void *privdata);
125 static void vmspace_terminate(struct vmspace *vm, int final);
126
127 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
128 static struct objcache *vmspace_cache;
129
130 /*
131  * per-cpu page table cross mappings are initialized in early boot
132  * and might require a considerable number of vm_map_entry structures.
133  */
134 #define MAPENTRYBSP_CACHE       (MAXCPU+1)
135 #define MAPENTRYAP_CACHE        8
136
137 /*
138  * Partitioning threaded programs with large anonymous memory areas can
139  * improve concurrent fault performance.
140  */
141 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
142 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
143
144 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)    \
145         ((((entry)->start ^ (entry)->end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
146
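/*
 * Worked example (informal): with the 32MB partition size above, the macro
 * is true only when entry->start and entry->end fall within the same
 * 32MB-aligned block.  start = 0x1000000 and end = 0x1fff000 XOR to
 * 0x0fff000, which vanishes under ~MAP_ENTRY_PARTITION_MASK, so the entry
 * lies within one partition; start = 0x1f00000 and end = 0x2100000 straddle
 * the 0x2000000 boundary and the macro evaluates false.
 */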
147 static struct vm_zone mapentzone_store;
148 static vm_zone_t mapentzone;
149
150 static struct vm_map_entry map_entry_init[MAX_MAPENT];
151 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
152 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
153
154 static int randomize_mmap;
155 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
156     "Randomize mmap offsets");
157 static int vm_map_relock_enable = 1;
158 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
159            &vm_map_relock_enable, 0, "insert pop pgtable optimization");
160 static int vm_map_partition_enable = 1;
161 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
162            &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
163
164 static void vmspace_drop_notoken(struct vmspace *vm);
165 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref);
166 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *);
167 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
168 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
169 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
170 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
171 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
172 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
173                 vm_map_entry_t);
174 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
175                 vm_offset_t start, vm_offset_t end, int *countp, int flags);
176 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
177                 vm_offset_t vaddr, int *countp);
178
179 /*
180  * Initialize the vm_map module.  Must be called before any other vm_map
181  * routines.
182  *
183  * Map and entry structures are allocated from the general purpose
184  * memory pool with some exceptions:
185  *
186  *      - The kernel map is allocated statically.
187  *      - Initial kernel map entries are allocated out of a static pool.
188  *      - We must set ZONE_SPECIAL here or the early boot code can get
189  *        stuck if there are >63 cores.
190  *
191  *      These restrictions are necessary since malloc() uses the
192  *      maps and requires map entries.
193  *
194  * Called from the low level boot code only.
195  */
196 void
197 vm_map_startup(void)
198 {
199         mapentzone = &mapentzone_store;
200         zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
201                   map_entry_init, MAX_MAPENT);
202         mapentzone_store.zflags |= ZONE_SPECIAL;
203 }
204
205 /*
206  * Called prior to any vmspace allocations.
207  *
208  * Called from the low level boot code only.
209  */
210 void
211 vm_init2(void) 
212 {
213         vmspace_cache = objcache_create_mbacked(M_VMSPACE,
214                                                 sizeof(struct vmspace),
215                                                 0, ncpus * 4,
216                                                 vmspace_ctor, vmspace_dtor,
217                                                 NULL);
218         zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
219         pmap_init2();
220         vm_object_init2();
221 }
222
223 /*
224  * objcache support.  We leave the pmap root cached as long as possible
225  * for performance reasons.
226  */
227 static
228 boolean_t
229 vmspace_ctor(void *obj, void *privdata, int ocflags)
230 {
231         struct vmspace *vm = obj;
232
233         bzero(vm, sizeof(*vm));
234         vm->vm_refcnt = VM_REF_DELETED;
235
236         return 1;
237 }
238
239 static
240 void
241 vmspace_dtor(void *obj, void *privdata)
242 {
243         struct vmspace *vm = obj;
244
245         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
246         pmap_puninit(vmspace_pmap(vm));
247 }
248
249 /*
250  * Red black tree functions
251  *
252  * The caller must hold the related map lock.
253  */
254 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
255 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
256
257 /* a->start is the address; it is the only field that has to be initialized */
258 static int
259 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
260 {
261         if (a->start < b->start)
262                 return(-1);
263         else if (a->start > b->start)
264                 return(1);
265         return(0);
266 }
267
268 /*
269  * Initialize vmspace ref/hold counts for vmspace0.  There is a holdcnt for
270  * every refcnt.
271  */
272 void
273 vmspace_initrefs(struct vmspace *vm)
274 {
275         vm->vm_refcnt = 1;
276         vm->vm_holdcnt = 1;
277 }
278
279 /*
280  * Allocate a vmspace structure, including a vm_map and pmap.
281  * Initialize numerous fields.  While the initial allocation is zeroed,
282  * subsequent reuse from the objcache leaves elements of the structure
283  * intact (particularly the pmap), so portions must be zeroed.
284  *
285  * Returns a referenced vmspace.
286  *
287  * No requirements.
288  */
289 struct vmspace *
290 vmspace_alloc(vm_offset_t min, vm_offset_t max)
291 {
292         struct vmspace *vm;
293
294         vm = objcache_get(vmspace_cache, M_WAITOK);
295
296         bzero(&vm->vm_startcopy,
297               (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
298         vm_map_init(&vm->vm_map, min, max, NULL);       /* initializes token */
299
300         /*
301          * NOTE: hold acquires the token for safety.
302          *
303          * On return vmspace is referenced (refs=1, hold=1).  That is,
304          * each refcnt also has a holdcnt.  There can be additional holds
305          * (holdcnt) above and beyond the refcnt.  Finalization is handled in
306          * two stages, one on refs 1->0, and the second on hold 1->0.
307          */
308         KKASSERT(vm->vm_holdcnt == 0);
309         KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
310         vmspace_initrefs(vm);
311         vmspace_hold(vm);
312         pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
313         vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
314         vm->vm_shm = NULL;
315         vm->vm_flags = 0;
316         cpu_vmspace_alloc(vm);
317         vmspace_drop(vm);
318
319         return (vm);
320 }
321
322 /*
323  * NOTE: Can return 0 if the vmspace is exiting.
324  */
325 int
326 vmspace_getrefs(struct vmspace *vm)
327 {
328         int32_t n;
329
330         n = vm->vm_refcnt;
331         cpu_ccfence();
332         if (n & VM_REF_DELETED)
333                 n = -1;
334         return n;
335 }
336
337 void
338 vmspace_hold(struct vmspace *vm)
339 {
340         atomic_add_int(&vm->vm_holdcnt, 1);
341         lwkt_gettoken(&vm->vm_map.token);
342 }
343
344 /*
345  * Drop with final termination interlock.
346  */
347 void
348 vmspace_drop(struct vmspace *vm)
349 {
350         lwkt_reltoken(&vm->vm_map.token);
351         vmspace_drop_notoken(vm);
352 }
353
354 static void
355 vmspace_drop_notoken(struct vmspace *vm)
356 {
357         if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
358                 if (vm->vm_refcnt & VM_REF_DELETED)
359                         vmspace_terminate(vm, 1);
360         }
361 }
362
363 /*
364  * A vmspace object must not be in a terminated state to be able to obtain
365  * additional refs on it.
366  *
367  * These are official references to the vmspace, the count is used to check
368  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
369  *
370  * XXX we need to combine hold & ref together into one 64-bit field to allow
371  * holds to prevent stage-1 termination.
372  */
373 void
374 vmspace_ref(struct vmspace *vm)
375 {
376         uint32_t n;
377
378         atomic_add_int(&vm->vm_holdcnt, 1);
379         n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
380         KKASSERT((n & VM_REF_DELETED) == 0);
381 }
382
383 /*
384  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
385  * termination of the vmspace.  Then, on the final drop of the hold we
386  * will do stage-2 final termination.
387  */
388 void
389 vmspace_rel(struct vmspace *vm)
390 {
391         uint32_t n;
392
393         /*
394          * Drop refs.  Each ref also has a hold which is also dropped.
395          *
396          * When refs hits 0, compete to set the VM_REF_DELETED flag (holds
397          * prevent finalization) to start termination processing.
398          * Finalization occurs when the last hold count drops to 0.
399          */
400         n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
401         while (n == 0) {
402                 if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
403                         vmspace_terminate(vm, 0);
404                         break;
405                 }
406                 n = vm->vm_refcnt;
407                 cpu_ccfence();
408         }
409         vmspace_drop_notoken(vm);
410 }
411
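/*
 * Usage sketch (informal), following the comments above: a foreign
 * accessor that only needs to inspect a vmspace uses the hold interlock,
 *
 *	vmspace_hold(vm);
 *	... examine vm->vm_map ...
 *	vmspace_drop(vm);
 *
 * while code that actually shares the address space pairs vmspace_ref()
 * with vmspace_rel().  Stage-1 termination runs on the refcnt 1->0
 * transition, stage-2 on the final holdcnt 1->0 transition.
 */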
412 /*
413  * This is called during exit indicating that the vmspace is no
413  * longer in use by an exiting process, but the process has not yet
415  * been reaped.
416  *
417  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
417  * to prevent stage-2 until the process is reaped.  Note the order of
418  * operations: we must hold first.
420  *
421  * No requirements.
422  */
423 void
424 vmspace_relexit(struct vmspace *vm)
425 {
426         atomic_add_int(&vm->vm_holdcnt, 1);
427         vmspace_rel(vm);
428 }
429
430 /*
431  * Called during reap to disconnect the remainder of the vmspace from
432  * the process.  On the hold drop the vmspace termination is finalized.
433  *
434  * No requirements.
435  */
436 void
437 vmspace_exitfree(struct proc *p)
438 {
439         struct vmspace *vm;
440
441         vm = p->p_vmspace;
442         p->p_vmspace = NULL;
443         vmspace_drop_notoken(vm);
444 }
445
446 /*
447  * Called in two cases:
448  *
449  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
450  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
451  *     and holdcnt will still be non-zero.
452  *
453  * (2) When holdcnt becomes 0, called with final == 1.  There should no
454  *     longer be anyone with access to the vmspace.
455  *
456  * VMSPACE_EXIT1 flags the primary deactivation
457  * VMSPACE_EXIT2 flags the last reap
458  */
459 static void
460 vmspace_terminate(struct vmspace *vm, int final)
461 {
462         int count;
463
464         lwkt_gettoken(&vm->vm_map.token);
465         if (final == 0) {
466                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
467                 vm->vm_flags |= VMSPACE_EXIT1;
468
469                 /*
470                  * Get rid of most of the resources.  Leave the kernel pmap
471                  * intact.
472                  *
473                  * If the pmap does not contain wired pages we can bulk-delete
474                  * the pmap as a performance optimization before removing the
475                  * related mappings.
476                  *
477                  * If the pmap contains wired pages we cannot do this
478                  * pre-optimization because currently vm_fault_unwire()
479                  * expects the pmap pages to exist and will not decrement
480                  * p->wire_count if they do not.
481                  */
482                 shmexit(vm);
483                 if (vmspace_pmap(vm)->pm_stats.wired_count) {
484                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
485                                       VM_MAX_USER_ADDRESS);
486                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
487                                           VM_MAX_USER_ADDRESS);
488                 } else {
489                         pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
490                                           VM_MAX_USER_ADDRESS);
491                         vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
492                                       VM_MAX_USER_ADDRESS);
493                 }
494                 lwkt_reltoken(&vm->vm_map.token);
495         } else {
496                 KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
497                 KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
498
499                 /*
500                  * Get rid of remaining basic resources.
501                  */
502                 vm->vm_flags |= VMSPACE_EXIT2;
503                 shmexit(vm);
504
505                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
506                 vm_map_lock(&vm->vm_map);
507                 cpu_vmspace_free(vm);
508
509                 /*
510                  * Lock the map, to wait out all other references to it.
511                  * Delete all of the mappings and pages they hold, then call
512                  * the pmap module to reclaim anything left.
513                  */
514                 vm_map_delete(&vm->vm_map, vm->vm_map.header.start,
515                               vm->vm_map.header.end, &count);
516                 vm_map_unlock(&vm->vm_map);
517                 vm_map_entry_release(count);
518
519                 pmap_release(vmspace_pmap(vm));
520                 lwkt_reltoken(&vm->vm_map.token);
521                 objcache_put(vmspace_cache, vm);
522         }
523 }
524
525 /*
526  * Swap usage is determined by taking the proportional swap used by
527  * VM objects backing the VM map.  To make up for fractional losses,
528  * if the VM object has any swap use at all, the associated map entries
529  * count for at least 1 swap page.
530  *
531  * No requirements.
532  */
533 vm_offset_t
534 vmspace_swap_count(struct vmspace *vm)
535 {
536         vm_map_t map = &vm->vm_map;
537         vm_map_entry_t cur;
538         vm_object_t object;
539         vm_offset_t count = 0;
540         vm_offset_t n;
541
542         vmspace_hold(vm);
543         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
544                 switch(cur->maptype) {
545                 case VM_MAPTYPE_NORMAL:
546                 case VM_MAPTYPE_VPAGETABLE:
547                         if ((object = cur->object.vm_object) == NULL)
548                                 break;
549                         if (object->swblock_count) {
550                                 n = (cur->end - cur->start) / PAGE_SIZE;
551                                 count += object->swblock_count *
552                                     SWAP_META_PAGES * n / object->size + 1;
553                         }
554                         break;
555                 default:
556                         break;
557                 }
558         }
559         vmspace_drop(vm);
560
561         return(count);
562 }
563
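/*
 * Worked example (informal): the proportional charge above is per entry.
 * For an object spanning 1000 pages with swblock_count == 2, an entry
 * mapping 250 of those pages is charged
 * 2 * SWAP_META_PAGES * 250 / 1000 + 1 swap pages, i.e. a quarter of the
 * object's swap estimate plus the one-page fudge for fractional losses.
 */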
564 /*
565  * Calculate the approximate number of anonymous pages in use by
566  * this vmspace.  To make up for fractional losses, we count each
567  * VM object as having at least 1 anonymous page.
568  *
569  * No requirements.
570  */
571 vm_offset_t
572 vmspace_anonymous_count(struct vmspace *vm)
573 {
574         vm_map_t map = &vm->vm_map;
575         vm_map_entry_t cur;
576         vm_object_t object;
577         vm_offset_t count = 0;
578
579         vmspace_hold(vm);
580         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
581                 switch(cur->maptype) {
582                 case VM_MAPTYPE_NORMAL:
583                 case VM_MAPTYPE_VPAGETABLE:
584                         if ((object = cur->object.vm_object) == NULL)
585                                 break;
586                         if (object->type != OBJT_DEFAULT &&
587                             object->type != OBJT_SWAP) {
588                                 break;
589                         }
590                         count += object->resident_page_count;
591                         break;
592                 default:
593                         break;
594                 }
595         }
596         vmspace_drop(vm);
597
598         return(count);
599 }
600
601 /*
602  * Initialize an existing vm_map structure such as that in the vmspace
603  * structure.  The pmap is initialized elsewhere.
604  *
605  * No requirements.
606  */
607 void
608 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap)
609 {
610         map->header.next = map->header.prev = &map->header;
611         RB_INIT(&map->rb_root);
612         spin_init(&map->ilock_spin, "ilock");
613         map->ilock_base = NULL;
614         map->nentries = 0;
615         map->size = 0;
616         map->system_map = 0;
617         map->header.start = min;
618         map->header.end = max;
619         map->pmap = pmap;
620         map->timestamp = 0;
621         map->flags = 0;
622         bzero(&map->freehint, sizeof(map->freehint));
623         lwkt_token_init(&map->token, "vm_map");
624         lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
625 }
626
627 /*
628  * Find the first possible free address for the specified request length.
629  * Returns 0 if we don't have one cached.
630  */
631 static
632 vm_offset_t
633 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
634 {
635         vm_map_freehint_t *scan;
636
637         scan = &map->freehint[0];
638         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
639                 if (scan->length == length && scan->align == align)
640                         return(scan->start);
641                 ++scan;
642         }
643         return 0;
644 }
645
646 /*
647  * Unconditionally set the freehint.  Called by vm_map_findspace() after
648  * it finds an address.  This will help us iterate optimally on the next
649  * similar findspace.
650  */
651 static
652 void
653 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
654                        vm_size_t length, vm_size_t align)
655 {
656         vm_map_freehint_t *scan;
657
658         scan = &map->freehint[0];
659         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
660                 if (scan->length == length && scan->align == align) {
661                         scan->start = start;
662                         return;
663                 }
664                 ++scan;
665         }
666         scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
667         scan->start = start;
668         scan->align = align;
669         scan->length = length;
670         ++map->freehint_newindex;
671 }
672
673 /*
674  * Update any existing freehints (for any alignment), for the hole we just
675  * added.
676  */
677 static
678 void
679 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
680 {
681         vm_map_freehint_t *scan;
682
683         scan = &map->freehint[0];
684         while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
685                 if (scan->length <= length && scan->start > start)
686                         scan->start = start;
687                 ++scan;
688         }
689 }
690
691 /*
692  * Shadow the vm_map_entry's object.  This typically needs to be done when
693  * a write fault is taken on an entry which had previously been cloned by
694  * fork().  The shared object (which might be NULL) must become private so
695  * we add a shadow layer above it.
696  *
697  * Object allocation for anonymous mappings is deferred as long as possible.
698  * When creating a shadow, however, the underlying object must be instantiated
699  * so it can be shared.
700  *
701  * If the map segment is governed by a virtual page table then it is
702  * possible to address offsets beyond the mapped area.  Just allocate
703  * a maximally sized object for this case.
704  *
705  * If addref is non-zero an additional reference is added to the returned
706  * entry.  This mechanic exists because the additional reference might have
707  * to be added atomically and not after return to prevent a premature
708  * collapse.
709  *
710  * The vm_map must be exclusively locked.
711  * No other requirements.
712  */
713 static
714 void
715 vm_map_entry_shadow(vm_map_entry_t entry, int addref)
716 {
717         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
718                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
719                                  0x7FFFFFFF, addref);   /* XXX */
720         } else {
721                 vm_object_shadow(&entry->object.vm_object, &entry->offset,
722                                  atop(entry->end - entry->start), addref);
723         }
724         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
725 }
726
727 /*
728  * Allocate an object for a vm_map_entry.
729  *
730  * Object allocation for anonymous mappings is deferred as long as possible.
731  * This function is called when we can defer no longer, generally when a map
732  * entry might be split or forked or takes a page fault.
733  *
734  * If the map segment is governed by a virtual page table then it is
735  * possible to address offsets beyond the mapped area.  Just allocate
736  * a maximally sized object for this case.
737  *
738  * The vm_map must be exclusively locked.
739  * No other requirements.
740  */
741 void 
742 vm_map_entry_allocate_object(vm_map_entry_t entry)
743 {
744         vm_object_t obj;
745
746         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
747                 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */
748         } else {
749                 obj = vm_object_allocate(OBJT_DEFAULT,
750                                          atop(entry->end - entry->start));
751         }
752         entry->object.vm_object = obj;
753         entry->offset = 0;
754 }
755
756 /*
757  * Set an initial negative count so the first attempt to reserve
758  * space preloads a bunch of vm_map_entry's for this cpu.  Also
759  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
760  * map a new page for vm_map_entry structures.  SMP systems are
761  * particularly sensitive.
762  *
763  * This routine is called in early boot so we cannot just call
764  * vm_map_entry_reserve().
765  *
766  * Called from the low level boot code only (for each cpu)
767  *
768  * WARNING! Take care not to have too-big a static/BSS structure here
769  *          as MAXCPU can be 256+, otherwise the loader's 64MB heap
770  *          can get blown out by the kernel plus the initrd image.
771  */
772 void
773 vm_map_entry_reserve_cpu_init(globaldata_t gd)
774 {
775         vm_map_entry_t entry;
776         int count;
777         int i;
778
779         atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
780         if (gd->gd_cpuid == 0) {
781                 entry = &cpu_map_entry_init_bsp[0];
782                 count = MAPENTRYBSP_CACHE;
783         } else {
784                 entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
785                 count = MAPENTRYAP_CACHE;
786         }
787         for (i = 0; i < count; ++i, ++entry) {
788                 entry->next = gd->gd_vme_base;
789                 gd->gd_vme_base = entry;
790         }
791 }
792
793 /*
794  * Reserves vm_map_entry structures so code later-on can manipulate
795  * map_entry structures within a locked map without blocking trying
796  * to allocate a new vm_map_entry.
797  *
798  * No requirements.
799  *
800  * WARNING!  We must not decrement gd_vme_avail until after we have
801  *           ensured that sufficient entries exist, otherwise we can
802  *           get into an endless call recursion in the zalloc code
803  *           itself.
804  */
805 int
806 vm_map_entry_reserve(int count)
807 {
808         struct globaldata *gd = mycpu;
809         vm_map_entry_t entry;
810
811         /*
812          * Make sure we have enough structures in gd_vme_base to handle
813          * the reservation request.
814          *
815          * Use a critical section to protect against VM faults.  It might
816          * not be needed, but we have to be careful here.
817          */
818         if (gd->gd_vme_avail < count) {
819                 crit_enter();
820                 while (gd->gd_vme_avail < count) {
821                         entry = zalloc(mapentzone);
822                         entry->next = gd->gd_vme_base;
823                         gd->gd_vme_base = entry;
824                         atomic_add_int(&gd->gd_vme_avail, 1);
825                 }
826                 crit_exit();
827         }
828         atomic_add_int(&gd->gd_vme_avail, -count);
829
830         return(count);
831 }
832
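/*
 * Usage sketch (informal), mirroring vm_map_find() and vmspace_terminate()
 * in this file: entries are reserved before taking the map lock so that
 * map manipulation never has to block in zalloc() while the map is locked,
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	... clip/insert/delete entries, passing &count ...
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */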
833 /*
834  * Releases previously reserved vm_map_entry structures that were not
835  * used.  If we have too much junk in our per-cpu cache clean some of
836  * it out.
837  *
838  * No requirements.
839  */
840 void
841 vm_map_entry_release(int count)
842 {
843         struct globaldata *gd = mycpu;
844         vm_map_entry_t entry;
845         vm_map_entry_t efree;
846
847         count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
848         if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
849                 efree = NULL;
850                 crit_enter();
851                 while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
852                         entry = gd->gd_vme_base;
853                         KKASSERT(entry != NULL);
854                         gd->gd_vme_base = entry->next;
855                         atomic_add_int(&gd->gd_vme_avail, -1);
856                         entry->next = efree;
857                         efree = entry;
858                 }
859                 crit_exit();
860                 while ((entry = efree) != NULL) {
861                         efree = efree->next;
862                         zfree(mapentzone, entry);
863                 }
864         }
865 }
866
867 /*
868  * Reserve map entry structures for use in kernel_map itself.  These
869  * entries have *ALREADY* been reserved on a per-cpu basis when the map
870  * was inited.  This function is used by zalloc() to avoid a recursion
871  * when zalloc() itself needs to allocate additional kernel memory.
872  *
873  * This function works like the normal reserve but does not load the
874  * vm_map_entry cache (because that would result in an infinite
875  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
876  *
877  * Any caller of this function must be sure to renormalize after
878  * potentially eating entries to ensure that the reserve supply
879  * remains intact.
880  *
881  * No requirements.
882  */
883 int
884 vm_map_entry_kreserve(int count)
885 {
886         struct globaldata *gd = mycpu;
887
888         atomic_add_int(&gd->gd_vme_avail, -count);
889         KASSERT(gd->gd_vme_base != NULL,
890                 ("no reserved entries left, gd_vme_avail = %d",
891                 gd->gd_vme_avail));
892         return(count);
893 }
894
895 /*
896  * Release previously reserved map entries for kernel_map.  We do not
897  * attempt to clean up like the normal release function as this would
898  * cause an unnecessary (but probably not fatal) deep procedure call.
899  *
900  * No requirements.
901  */
902 void
903 vm_map_entry_krelease(int count)
904 {
905         struct globaldata *gd = mycpu;
906
907         atomic_add_int(&gd->gd_vme_avail, count);
908 }
909
910 /*
911  * Allocates a VM map entry for insertion.  No entry fields are filled in.
912  *
913  * The entries should have previously been reserved.  The reservation count
914  * is tracked in (*countp).
915  *
916  * No requirements.
917  */
918 static vm_map_entry_t
919 vm_map_entry_create(vm_map_t map, int *countp)
920 {
921         struct globaldata *gd = mycpu;
922         vm_map_entry_t entry;
923
924         KKASSERT(*countp > 0);
925         --*countp;
926         crit_enter();
927         entry = gd->gd_vme_base;
928         KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
929         gd->gd_vme_base = entry->next;
930         crit_exit();
931
932         return(entry);
933 }
934
935 /*
936  * Dispose of a vm_map_entry that is no longer being referenced.
937  *
938  * No requirements.
939  */
940 static void
941 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
942 {
943         struct globaldata *gd = mycpu;
944
945         ++*countp;
946         crit_enter();
947         entry->next = gd->gd_vme_base;
948         gd->gd_vme_base = entry;
949         crit_exit();
950 }
951
952
953 /*
954  * Insert/remove entries from maps.
955  *
956  * The related map must be exclusively locked.
957  * The caller must hold map->token
958  * No other requirements.
959  */
960 static __inline void
961 vm_map_entry_link(vm_map_t map,
962                   vm_map_entry_t after_where,
963                   vm_map_entry_t entry)
964 {
965         ASSERT_VM_MAP_LOCKED(map);
966
967         map->nentries++;
968         entry->prev = after_where;
969         entry->next = after_where->next;
970         entry->next->prev = entry;
971         after_where->next = entry;
972         if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
973                 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
974 }
975
976 static __inline void
977 vm_map_entry_unlink(vm_map_t map,
978                     vm_map_entry_t entry)
979 {
980         vm_map_entry_t prev;
981         vm_map_entry_t next;
982
983         ASSERT_VM_MAP_LOCKED(map);
984
985         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
986                 panic("vm_map_entry_unlink: attempt to mess with "
987                       "locked entry! %p", entry);
988         }
989         prev = entry->prev;
990         next = entry->next;
991         next->prev = prev;
992         prev->next = next;
993         vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
994         map->nentries--;
995 }
996
997 /*
998  * Finds the map entry containing (or immediately preceding) the specified
999  * address in the given map.  The entry is returned in (*entry).
1000  *
1001  * The boolean result indicates whether the address is actually contained
1002  * in the map.
1003  *
1004  * The related map must be locked.
1005  * No other requirements.
1006  */
1007 boolean_t
1008 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1009 {
1010         vm_map_entry_t tmp;
1011         vm_map_entry_t last;
1012
1013         ASSERT_VM_MAP_LOCKED(map);
1014
1015         /*
1016          * Locate the record from the top of the tree.  'last' tracks the
1017          * closest prior record and is returned if no match is found, which
1018          * in binary tree terms means tracking the most recent right-branch
1019          * taken.  If there is no prior record, &map->header is returned.
1020          */
1021         last = &map->header;
1022         tmp = RB_ROOT(&map->rb_root);
1023
1024         while (tmp) {
1025                 if (address >= tmp->start) {
1026                         if (address < tmp->end) {
1027                                 *entry = tmp;
1028                                 return(TRUE);
1029                         }
1030                         last = tmp;
1031                         tmp = RB_RIGHT(tmp, rb_entry);
1032                 } else {
1033                         tmp = RB_LEFT(tmp, rb_entry);
1034                 }
1035         }
1036         *entry = last;
1037         return (FALSE);
1038 }
1039
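/*
 * Usage sketch (informal): a typical caller does
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		... addr lies within [entry->start, entry->end) ...
 *	} else {
 *		... entry is the closest prior entry, or &map->header ...
 *	}
 *
 * Callers such as vm_map_insert() then test entry->next->start to see
 * whether a requested range fits in the hole that follows.
 */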
1040 /*
1041  * Inserts the given whole VM object into the target map at the specified
1042  * address range.  The object's size should match that of the address range.
1043  *
1044  * The map must be exclusively locked.
1045  * The object must be held.
1046  * The caller must have reserved sufficient vm_map_entry structures.
1047  *
1048  * If object is non-NULL, ref count must be bumped by caller prior to
1049  * making call to account for the new entry.
1050  */
1051 int
1052 vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
1053               vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
1054               vm_maptype_t maptype, vm_subsys_t id,
1055               vm_prot_t prot, vm_prot_t max, int cow)
1056 {
1057         vm_map_entry_t new_entry;
1058         vm_map_entry_t prev_entry;
1059         vm_map_entry_t temp_entry;
1060         vm_eflags_t protoeflags;
1061         int must_drop = 0;
1062         vm_object_t object;
1063
1064         if (maptype == VM_MAPTYPE_UKSMAP)
1065                 object = NULL;
1066         else
1067                 object = map_object;
1068
1069         ASSERT_VM_MAP_LOCKED(map);
1070         if (object)
1071                 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1072
1073         /*
1074          * Check that the start and end points are not bogus.
1075          */
1076         if ((start < map->header.start) || (end > map->header.end) ||
1077             (start >= end))
1078                 return (KERN_INVALID_ADDRESS);
1079
1080         /*
1081          * Find the entry prior to the proposed starting address; if it's part
1082          * of an existing entry, this range is bogus.
1083          */
1084         if (vm_map_lookup_entry(map, start, &temp_entry))
1085                 return (KERN_NO_SPACE);
1086
1087         prev_entry = temp_entry;
1088
1089         /*
1090          * Assert that the next entry doesn't overlap the end point.
1091          */
1092
1093         if ((prev_entry->next != &map->header) &&
1094             (prev_entry->next->start < end))
1095                 return (KERN_NO_SPACE);
1096
1097         protoeflags = 0;
1098
1099         if (cow & MAP_COPY_ON_WRITE)
1100                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1101
1102         if (cow & MAP_NOFAULT) {
1103                 protoeflags |= MAP_ENTRY_NOFAULT;
1104
1105                 KASSERT(object == NULL,
1106                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1107         }
1108         if (cow & MAP_DISABLE_SYNCER)
1109                 protoeflags |= MAP_ENTRY_NOSYNC;
1110         if (cow & MAP_DISABLE_COREDUMP)
1111                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1112         if (cow & MAP_IS_STACK)
1113                 protoeflags |= MAP_ENTRY_STACK;
1114         if (cow & MAP_IS_KSTACK)
1115                 protoeflags |= MAP_ENTRY_KSTACK;
1116
1117         lwkt_gettoken(&map->token);
1118
1119         if (object) {
1120                 /*
1121                  * When object is non-NULL, it could be shared with another
1122                  * process.  We have to set or clear OBJ_ONEMAPPING 
1123                  * appropriately.
1124                  *
1125                  * NOTE: This flag is only applicable to DEFAULT and SWAP
1126                  *       objects and will already be clear in other types
1127                  *       of objects, so a shared object lock is ok for
1128                  *       VNODE objects.
1129                  */
1130                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
1131                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1132                 }
1133         }
1134         else if ((prev_entry != &map->header) &&
1135                  (prev_entry->eflags == protoeflags) &&
1136                  (prev_entry->end == start) &&
1137                  (prev_entry->wired_count == 0) &&
1138                  (prev_entry->id == id) &&
1139                  prev_entry->maptype == maptype &&
1140                  maptype == VM_MAPTYPE_NORMAL &&
1141                  ((prev_entry->object.vm_object == NULL) ||
1142                   vm_object_coalesce(prev_entry->object.vm_object,
1143                                      OFF_TO_IDX(prev_entry->offset),
1144                                      (vm_size_t)(prev_entry->end - prev_entry->start),
1145                                      (vm_size_t)(end - prev_entry->end)))) {
1146                 /*
1147                  * We were able to extend the object.  Determine if we
1148                  * can extend the previous map entry to include the 
1149                  * new range as well.
1150                  */
1151                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1152                     (prev_entry->protection == prot) &&
1153                     (prev_entry->max_protection == max)) {
1154                         map->size += (end - prev_entry->end);
1155                         prev_entry->end = end;
1156                         vm_map_simplify_entry(map, prev_entry, countp);
1157                         lwkt_reltoken(&map->token);
1158                         return (KERN_SUCCESS);
1159                 }
1160
1161                 /*
1162                  * If we can extend the object but cannot extend the
1163                  * map entry, we have to create a new map entry.  We
1164                  * must bump the ref count on the extended object to
1165                  * account for it.  object may be NULL.
1166                  *
1167                  * XXX if object is NULL should we set offset to 0 here ?
1168                  */
1169                 object = prev_entry->object.vm_object;
1170                 offset = prev_entry->offset +
1171                         (prev_entry->end - prev_entry->start);
1172                 if (object) {
1173                         vm_object_hold(object);
1174                         vm_object_chain_wait(object, 0);
1175                         vm_object_reference_locked(object);
1176                         must_drop = 1;
1177                         map_object = object;
1178                 }
1179         }
1180
1181         /*
1182          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1183          * in things like the buffer map where we manage kva but do not manage
1184          * backing objects.
1185          */
1186
1187         /*
1188          * Create a new entry
1189          */
1190
1191         new_entry = vm_map_entry_create(map, countp);
1192         new_entry->start = start;
1193         new_entry->end = end;
1194         new_entry->id = id;
1195
1196         new_entry->maptype = maptype;
1197         new_entry->eflags = protoeflags;
1198         new_entry->object.map_object = map_object;
1199         new_entry->aux.master_pde = 0;          /* in case size is different */
1200         new_entry->aux.map_aux = map_aux;
1201         new_entry->offset = offset;
1202
1203         new_entry->inheritance = VM_INHERIT_DEFAULT;
1204         new_entry->protection = prot;
1205         new_entry->max_protection = max;
1206         new_entry->wired_count = 0;
1207
1208         /*
1209          * Insert the new entry into the list
1210          */
1211
1212         vm_map_entry_link(map, prev_entry, new_entry);
1213         map->size += new_entry->end - new_entry->start;
1214
1215         /*
1216          * Don't worry about updating freehint[] when inserting, allow
1217          * addresses to be lower than the actual first free spot.
1218          */
1219 #if 0
1220         /*
1221          * Temporarily removed to avoid MAP_STACK panic, due to
1222          * MAP_STACK being a huge hack.  Will be added back in
1223          * when MAP_STACK (and the user stack mapping) is fixed.
1224          */
1225         /*
1226          * It may be possible to simplify the entry
1227          */
1228         vm_map_simplify_entry(map, new_entry, countp);
1229 #endif
1230
1231         /*
1232          * Try to pre-populate the page table.  Mappings governed by virtual
1233          * page tables cannot be prepopulated without a lot of work, so
1234          * don't try.
1235          */
1236         if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1237             maptype != VM_MAPTYPE_VPAGETABLE &&
1238             maptype != VM_MAPTYPE_UKSMAP) {
1239                 int dorelock = 0;
1240                 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1241                         dorelock = 1;
1242                         vm_object_lock_swap();
1243                         vm_object_drop(object);
1244                 }
1245                 pmap_object_init_pt(map->pmap, start, prot,
1246                                     object, OFF_TO_IDX(offset), end - start,
1247                                     cow & MAP_PREFAULT_PARTIAL);
1248                 if (dorelock) {
1249                         vm_object_hold(object);
1250                         vm_object_lock_swap();
1251                 }
1252         }
1253         if (must_drop)
1254                 vm_object_drop(object);
1255
1256         lwkt_reltoken(&map->token);
1257         return (KERN_SUCCESS);
1258 }
1259
1260 /*
1261  * Find sufficient space for `length' bytes in the given map, starting at
1262  * `start'.  Returns 0 on success, 1 on no space.
1263  *
1264  * This function will return an arbitrarily aligned pointer.  If no
1265  * particular alignment is required you should pass align as 1.  Note that
1266  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1267  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1268  * argument.
1269  *
1270  * 'align' should be a power of 2 but is not required to be.
1271  *
1272  * The map must be exclusively locked.
1273  * No other requirements.
1274  */
1275 int
1276 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1277                  vm_size_t align, int flags, vm_offset_t *addr)
1278 {
1279         vm_map_entry_t entry, next;
1280         vm_map_entry_t tmp;
1281         vm_offset_t hole_start;
1282         vm_offset_t end;
1283         vm_offset_t align_mask;
1284
1285         if (start < map->header.start)
1286                 start = map->header.start;
1287         if (start > map->header.end)
1288                 return (1);
1289
1290         /*
1291          * If the alignment is not a power of 2 we will have to use
1292  * a mod/division, so set align_mask to a special value.
1293          */
1294         if ((align | (align - 1)) + 1 != (align << 1))
1295                 align_mask = (vm_offset_t)-1;
1296         else
1297                 align_mask = align - 1;
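        /*
         * Worked example (informal): for a power of 2 such as align = 16,
         * (16 | 15) + 1 == 32 == 16 << 1, so the cheap mask path is taken
         * with align_mask = 15.  For align = 24, (24 | 23) + 1 == 32 != 48,
         * so align_mask becomes -1 and the roundup() path below handles the
         * non-power-of-2 alignment.
         */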
1298
1299         /*
1300          * Use freehint to adjust the start point, hopefully reducing
1301          * the iteration to O(1).
1302          */
1303         hole_start = vm_map_freehint_find(map, length, align);
1304         if (start < hole_start)
1305                 start = hole_start;
1306         if (vm_map_lookup_entry(map, start, &tmp))
1307                 start = tmp->end;
1308         entry = tmp;
1309
1310         /*
1311          * Look through the rest of the map, trying to fit a new region in the
1312          * gap between existing regions, or after the very last region.
1313          */
1314         for (;; start = (entry = next)->end) {
1315                 /*
1316                  * Adjust the proposed start by the requested alignment,
1317                  * be sure that we didn't wrap the address.
1318                  */
1319                 if (align_mask == (vm_offset_t)-1)
1320                         end = roundup(start, align);
1321                 else
1322                         end = (start + align_mask) & ~align_mask;
1323                 if (end < start)
1324                         return (1);
1325                 start = end;
1326
1327                 /*
1328                  * Find the end of the proposed new region.  Be sure we didn't
1329                  * go beyond the end of the map, or wrap around the address.
1330                  * Then check to see if this is the last entry or if the 
1331                  * proposed end fits in the gap between this and the next
1332                  * entry.
1333                  */
1334                 end = start + length;
1335                 if (end > map->header.end || end < start)
1336                         return (1);
1337                 next = entry->next;
1338
1339                 /*
1340                  * If the next entry's start address is beyond the desired
1341                  * end address we may have found a good entry.
1342                  *
1343                  * If the next entry is a stack mapping we do not map into
1344                  * the stack's reserved space.
1345                  *
1346                  * XXX continue to allow mapping into the stack's reserved
1347                  * space if doing a MAP_STACK mapping inside a MAP_STACK
1348                  * mapping, for backwards compatibility.  But the caller
1349                  * really should use MAP_STACK | MAP_TRYFIXED if they
1350                  * want to do that.
1351                  */
1352                 if (next == &map->header)
1353                         break;
1354                 if (next->start >= end) {
1355                         if ((next->eflags & MAP_ENTRY_STACK) == 0)
1356                                 break;
1357                         if (flags & MAP_STACK)
1358                                 break;
1359                         if (next->start - next->aux.avail_ssize >= end)
1360                                 break;
1361                 }
1362         }
1363
1364         /*
1365          * Update the freehint
1366          */
1367         vm_map_freehint_update(map, start, length, align);
1368
1369         /*
1370          * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1371          * if it fails.  The kernel_map is locked and nothing can steal
1372          * our address space if pmap_growkernel() blocks.
1373          *
1374          * NOTE: This may be unconditionally called for kldload areas on
1375          *       x86_64 because these do not bump kernel_vm_end (which would
1376          *       fill 128G worth of page tables!).  Therefore we must not
1377          *       retry.
1378          */
1379         if (map == &kernel_map) {
1380                 vm_offset_t kstop;
1381
1382                 kstop = round_page(start + length);
1383                 if (kstop > kernel_vm_end)
1384                         pmap_growkernel(start, kstop);
1385         }
1386         *addr = start;
1387         return (0);
1388 }
1389
1390 /*
1391  * vm_map_find finds an unallocated region in the target address map with
1392  * the given length and allocates it.  The search is defined to be first-fit
1393  * from the specified address; the region found is returned in the same
1394  * parameter.
1395  *
1396  * If object is non-NULL, ref count must be bumped by caller
1397  * prior to making call to account for the new entry.
1398  *
1399  * No requirements.  This function will lock the map temporarily.
1400  */
1401 int
1402 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1403             vm_ooffset_t offset, vm_offset_t *addr,
1404             vm_size_t length, vm_size_t align, boolean_t fitit,
1405             vm_maptype_t maptype, vm_subsys_t id,
1406             vm_prot_t prot, vm_prot_t max, int cow)
1407 {
1408         vm_offset_t start;
1409         vm_object_t object;
1410         int result;
1411         int count;
1412
1413         if (maptype == VM_MAPTYPE_UKSMAP)
1414                 object = NULL;
1415         else
1416                 object = map_object;
1417
1418         start = *addr;
1419
1420         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1421         vm_map_lock(map);
1422         if (object)
1423                 vm_object_hold_shared(object);
1424         if (fitit) {
1425                 if (vm_map_findspace(map, start, length, align, 0, addr)) {
1426                         if (object)
1427                                 vm_object_drop(object);
1428                         vm_map_unlock(map);
1429                         vm_map_entry_release(count);
1430                         return (KERN_NO_SPACE);
1431                 }
1432                 start = *addr;
1433         }
1434         result = vm_map_insert(map, &count, map_object, map_aux,
1435                                offset, start, start + length,
1436                                maptype, id, prot, max, cow);
1437         if (object)
1438                 vm_object_drop(object);
1439         vm_map_unlock(map);
1440         vm_map_entry_release(count);
1441
1442         return (result);
1443 }
1444
1445 /*
1446  * Simplify the given map entry by merging with either neighbor.  This
1447  * routine also has the ability to merge with both neighbors.
1448  *
1449  * This routine guarantees that the passed entry remains valid (though
1450  * possibly extended).  When merging, this routine may delete one or
1451  * both neighbors.  No action is taken on entries which have their
1452  * in-transition flag set.
1453  *
1454  * The map must be exclusively locked.
1455  */
1456 void
1457 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1458 {
1459         vm_map_entry_t next, prev;
1460         vm_size_t prevsize, esize;
1461
1462         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1463                 ++mycpu->gd_cnt.v_intrans_coll;
1464                 return;
1465         }
1466
1467         if (entry->maptype == VM_MAPTYPE_SUBMAP)
1468                 return;
1469         if (entry->maptype == VM_MAPTYPE_UKSMAP)
1470                 return;
1471
1472         prev = entry->prev;
1473         if (prev != &map->header) {
1474                 prevsize = prev->end - prev->start;
1475                 if ( (prev->end == entry->start) &&
1476                      (prev->maptype == entry->maptype) &&
1477                      (prev->object.vm_object == entry->object.vm_object) &&
1478                      (!prev->object.vm_object ||
1479                         (prev->offset + prevsize == entry->offset)) &&
1480                      (prev->eflags == entry->eflags) &&
1481                      (prev->protection == entry->protection) &&
1482                      (prev->max_protection == entry->max_protection) &&
1483                      (prev->inheritance == entry->inheritance) &&
1484                      (prev->id == entry->id) &&
1485                      (prev->wired_count == entry->wired_count)) {
1486                         vm_map_entry_unlink(map, prev);
1487                         entry->start = prev->start;
1488                         entry->offset = prev->offset;
1489                         if (prev->object.vm_object)
1490                                 vm_object_deallocate(prev->object.vm_object);
1491                         vm_map_entry_dispose(map, prev, countp);
1492                 }
1493         }
1494
1495         next = entry->next;
1496         if (next != &map->header) {
1497                 esize = entry->end - entry->start;
1498                 if ((entry->end == next->start) &&
1499                     (next->maptype == entry->maptype) &&
1500                     (next->object.vm_object == entry->object.vm_object) &&
1501                      (!entry->object.vm_object ||
1502                         (entry->offset + esize == next->offset)) &&
1503                     (next->eflags == entry->eflags) &&
1504                     (next->protection == entry->protection) &&
1505                     (next->max_protection == entry->max_protection) &&
1506                     (next->inheritance == entry->inheritance) &&
1507                     (next->id == entry->id) &&
1508                     (next->wired_count == entry->wired_count)) {
1509                         vm_map_entry_unlink(map, next);
1510                         entry->end = next->end;
1511                         if (next->object.vm_object)
1512                                 vm_object_deallocate(next->object.vm_object);
1513                         vm_map_entry_dispose(map, next, countp);
1514                 }
1515         }
1516 }
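
/*
 * Illustrative usage sketch only (not part of the build): a hypothetical
 * caller of vm_map_simplify_entry().  The reserve/lock pattern mirrors the
 * other routines in this file; 'map' and 'addr' are assumed to be supplied
 * by the caller.
 *
 *      vm_map_entry_t entry;
 *      int count;
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      if (vm_map_lookup_entry(map, addr, &entry))
 *              vm_map_simplify_entry(map, entry, &count);
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */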
1517
1518 /*
1519  * Asserts that the given entry begins at or after the specified address.
1520  * If necessary, it splits the entry into two.
1521  */
1522 #define vm_map_clip_start(map, entry, startaddr, countp)                \
1523 {                                                                       \
1524         if (startaddr > entry->start)                                   \
1525                 _vm_map_clip_start(map, entry, startaddr, countp);      \
1526 }
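
/*
 * Worked example (illustrative, hypothetical addresses): clipping an entry
 * spanning [0x10000, 0x40000) with
 *
 *      vm_map_clip_start(map, entry, 0x20000, &count);
 *
 * inserts a new predecessor entry covering [0x10000, 0x20000) and shrinks
 * the original entry to [0x20000, 0x40000), bumping entry->offset by
 * 0x10000 so the backing object offsets stay consistent.  If startaddr is
 * at or below entry->start the macro is a no-op.
 */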
1527
1528 /*
1529  * This routine is called only when it is known that the entry must be split.
1530  *
1531  * The map must be exclusively locked.
1532  */
1533 static void
1534 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1535                    int *countp)
1536 {
1537         vm_map_entry_t new_entry;
1538
1539         /*
1540          * Split off the front portion -- note that we must insert the new
1541          * entry BEFORE this one, so that this entry has the specified
1542          * starting address.
1543          */
1544
1545         vm_map_simplify_entry(map, entry, countp);
1546
1547         /*
1548          * If there is no object backing this entry, we might as well create
1549          * one now.  If we defer it, an object can get created after the map
1550          * is clipped, and individual objects will be created for the split-up
1551          * map.  This is a bit of a hack, but is also about the best place to
1552          * put this improvement.
1553          */
1554         if (entry->object.vm_object == NULL && !map->system_map &&
1555             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1556                 vm_map_entry_allocate_object(entry);
1557         }
1558
1559         new_entry = vm_map_entry_create(map, countp);
1560         *new_entry = *entry;
1561
1562         new_entry->end = start;
1563         entry->offset += (start - entry->start);
1564         entry->start = start;
1565
1566         vm_map_entry_link(map, entry->prev, new_entry);
1567
1568         switch(entry->maptype) {
1569         case VM_MAPTYPE_NORMAL:
1570         case VM_MAPTYPE_VPAGETABLE:
1571                 if (new_entry->object.vm_object) {
1572                         vm_object_hold(new_entry->object.vm_object);
1573                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1574                         vm_object_reference_locked(new_entry->object.vm_object);
1575                         vm_object_drop(new_entry->object.vm_object);
1576                 }
1577                 break;
1578         default:
1579                 break;
1580         }
1581 }
1582
1583 /*
1584  * Asserts that the given entry ends at or before the specified address.
1585  * If necessary, it splits the entry into two.
1586  *
1587  * The map must be exclusively locked.
1588  */
1589 #define vm_map_clip_end(map, entry, endaddr, countp)            \
1590 {                                                               \
1591         if (endaddr < entry->end)                               \
1592                 _vm_map_clip_end(map, entry, endaddr, countp);  \
1593 }
1594
1595 /*
1596  * This routine is called only when it is known that the entry must be split.
1597  *
1598  * The map must be exclusively locked.
1599  */
1600 static void
1601 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1602                  int *countp)
1603 {
1604         vm_map_entry_t new_entry;
1605
1606         /*
1607          * If there is no object backing this entry, we might as well create
1608          * one now.  If we defer it, an object can get created after the map
1609          * is clipped, and individual objects will be created for the split-up
1610          * map.  This is a bit of a hack, but is also about the best place to
1611          * put this improvement.
1612          */
1613
1614         if (entry->object.vm_object == NULL && !map->system_map &&
1615             VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1616                 vm_map_entry_allocate_object(entry);
1617         }
1618
1619         /*
1620          * Create a new entry and insert it AFTER the specified entry
1621          */
1622         new_entry = vm_map_entry_create(map, countp);
1623         *new_entry = *entry;
1624
1625         new_entry->start = entry->end = end;
1626         new_entry->offset += (end - entry->start);
1627
1628         vm_map_entry_link(map, entry, new_entry);
1629
1630         switch(entry->maptype) {
1631         case VM_MAPTYPE_NORMAL:
1632         case VM_MAPTYPE_VPAGETABLE:
1633                 if (new_entry->object.vm_object) {
1634                         vm_object_hold(new_entry->object.vm_object);
1635                         vm_object_chain_wait(new_entry->object.vm_object, 0);
1636                         vm_object_reference_locked(new_entry->object.vm_object);
1637                         vm_object_drop(new_entry->object.vm_object);
1638                 }
1639                 break;
1640         default:
1641                 break;
1642         }
1643 }
1644
1645 /*
1646  * Asserts that the starting and ending region addresses fall within the
1647  * valid range for the map.
1648  */
1649 #define VM_MAP_RANGE_CHECK(map, start, end)     \
1650 {                                               \
1651         if (start < vm_map_min(map))            \
1652                 start = vm_map_min(map);        \
1653         if (end > vm_map_max(map))              \
1654                 end = vm_map_max(map);          \
1655         if (start > end)                        \
1656                 start = end;                    \
1657 }
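
/*
 * Worked example (illustrative, hypothetical values): with vm_map_min(map)
 * == 0x1000 and vm_map_max(map) == 0xF000, a request for [0x0800, 0x10000)
 * is clamped to [0x1000, 0xF000), while a request lying entirely below the
 * map, e.g. [0x0000, 0x0800), collapses to the empty range
 * [0x0800, 0x0800).
 */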
1658
1659 /*
1660  * Used to block when an in-transition collision occurs.  The map is
1661  * unlocked for the sleep and relocked before return if 'relock' is set.
1662  */
1663 void
1664 vm_map_transition_wait(vm_map_t map, int relock)
1665 {
1666         tsleep_interlock(map, 0);
1667         vm_map_unlock(map);
1668         tsleep(map, PINTERLOCKED, "vment", 0);
1669         if (relock)
1670                 vm_map_lock(map);
1671 }
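
/*
 * Illustrative sketch (not part of the build) of the retry pattern used by
 * callers such as vm_map_clip_range() below: request a wakeup, sleep with
 * the map unlocked, then redo the lookup because the entry may have been
 * clipped or freed while we slept.
 *
 * again:
 *      if (vm_map_lookup_entry(map, start, &entry) == FALSE)
 *              return (NULL);
 *      if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 *              entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 *              vm_map_transition_wait(map, 1);
 *              goto again;
 *      }
 */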
1672
1673 /*
1674  * When we do blocking operations with the map lock held it is
1675  * possible that a clip might have occurred on our in-transit entry,
1676  * requiring an adjustment to the entry in our loop.  These macros
1677  * help the pageable and clip_range code deal with the case.  The
1678  * conditional costs virtually nothing if no clipping has occurred.
1679  */
1680
1681 #define CLIP_CHECK_BACK(entry, save_start)              \
1682     do {                                                \
1683             while (entry->start != save_start) {        \
1684                     entry = entry->prev;                \
1685                     KASSERT(entry != &map->header, ("bad entry clip")); \
1686             }                                           \
1687     } while(0)
1688
1689 #define CLIP_CHECK_FWD(entry, save_end)                 \
1690     do {                                                \
1691             while (entry->end != save_end) {            \
1692                     entry = entry->next;                \
1693                     KASSERT(entry != &map->header, ("bad entry clip")); \
1694             }                                           \
1695     } while(0)
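
/*
 * Illustrative sketch (not part of the build) of how the wiring code below
 * uses these macros around a call which may temporarily unlock the map:
 *
 *      save_start = entry->start;
 *      save_end = entry->end;
 *      rv = vm_fault_wire(map, entry, TRUE, 0);
 *      CLIP_CHECK_BACK(entry, save_start);
 *
 * The macros simply walk backwards or forwards over any clip fragments
 * until the saved boundary is found again.
 */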
1696
1697
1698 /*
1699  * Clip the specified range and return the base entry.  The
1700  * range may cover several entries starting at the returned base
1701  * and the first and last entry in the covering sequence will be
1702  * properly clipped to the requested start and end address.
1703  *
1704  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1705  * flag.
1706  *
1707  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1708  * covered by the requested range.
1709  *
1710  * The map must be exclusively locked on entry and will remain locked
1711  * on return. If no range exists or the range contains holes and you
1712  * specified that no holes were allowed, NULL will be returned.  This
1713  * routine may temporarily unlock the map in order to avoid a deadlock when
1714  * sleeping.
1715  */
1716 static
1717 vm_map_entry_t
1718 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, 
1719                   int *countp, int flags)
1720 {
1721         vm_map_entry_t start_entry;
1722         vm_map_entry_t entry;
1723
1724         /*
1725          * Locate the entry and effect initial clipping.  The in-transition
1726          * case does not occur very often so do not try to optimize it.
1727          */
1728 again:
1729         if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1730                 return (NULL);
1731         entry = start_entry;
1732         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1733                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1734                 ++mycpu->gd_cnt.v_intrans_coll;
1735                 ++mycpu->gd_cnt.v_intrans_wait;
1736                 vm_map_transition_wait(map, 1);
1737                 /*
1738                  * entry and/or start_entry may have been clipped while
1739                  * we slept, or may have gone away entirely.  We have
1740                  * to restart from the lookup.
1741                  */
1742                 goto again;
1743         }
1744
1745         /*
1746          * Since we hold an exclusive map lock we do not have to restart
1747          * after clipping, even though clipping may block in zalloc.
1748          */
1749         vm_map_clip_start(map, entry, start, countp);
1750         vm_map_clip_end(map, entry, end, countp);
1751         entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1752
1753         /*
1754          * Scan entries covered by the range.  When working on the next
1755          * entry a restart need only re-loop on the current entry which
1756          * we have already locked, since 'next' may have changed.  Also,
1757          * even though entry is safe, it may have been clipped so we
1758          * have to iterate forwards through the clip after sleeping.
1759          */
1760         while (entry->next != &map->header && entry->next->start < end) {
1761                 vm_map_entry_t next = entry->next;
1762
1763                 if (flags & MAP_CLIP_NO_HOLES) {
1764                         if (next->start > entry->end) {
1765                                 vm_map_unclip_range(map, start_entry,
1766                                         start, entry->end, countp, flags);
1767                                 return(NULL);
1768                         }
1769                 }
1770
1771                 if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1772                         vm_offset_t save_end = entry->end;
1773                         next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1774                         ++mycpu->gd_cnt.v_intrans_coll;
1775                         ++mycpu->gd_cnt.v_intrans_wait;
1776                         vm_map_transition_wait(map, 1);
1777
1778                         /*
1779                          * clips might have occurred while we blocked.
1780                          */
1781                         CLIP_CHECK_FWD(entry, save_end);
1782                         CLIP_CHECK_BACK(start_entry, start);
1783                         continue;
1784                 }
1785
1786                 /*
1787                  * No restart necessary even though clip_end may block, we
1788                  * are holding the map lock.
1789                  */
1790                 vm_map_clip_end(map, next, end, countp);
1791                 next->eflags |= MAP_ENTRY_IN_TRANSITION;
1792                 entry = next;
1793         }
1794         if (flags & MAP_CLIP_NO_HOLES) {
1795                 if (entry->end != end) {
1796                         vm_map_unclip_range(map, start_entry,
1797                                 start, entry->end, countp, flags);
1798                         return(NULL);
1799                 }
1800         }
1801         return(start_entry);
1802 }
1803
1804 /*
1805  * Undo the effect of vm_map_clip_range().  You should pass the same
1806  * flags and the same range that you passed to vm_map_clip_range().
1807  * This code will clear the in-transition flag on the entries and
1808  * wake up anyone waiting.  This code will also simplify the sequence
1809  * and attempt to merge it with entries before and after the sequence.
1810  *
1811  * The map must be locked on entry and will remain locked on return.
1812  *
1813  * Note that you should also pass the start_entry returned by
1814  * vm_map_clip_range().  However, if you block between the two calls
1815  * with the map unlocked please be aware that the start_entry may
1816  * have been clipped and you may need to scan it backwards to find
1817  * the entry corresponding with the original start address.  You are
1818  * responsible for this, vm_map_unclip_range() expects the correct
1819  * start_entry to be passed to it and will KASSERT otherwise.
1820  */
1821 static
1822 void
1823 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
1824                     vm_offset_t start, vm_offset_t end,
1825                     int *countp, int flags)
1826 {
1827         vm_map_entry_t entry;
1828
1829         entry = start_entry;
1830
1831         KASSERT(entry->start == start, ("unclip_range: illegal base entry"));
1832         while (entry != &map->header && entry->start < end) {
1833                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1834                         ("in-transition flag not set during unclip on: %p",
1835                         entry));
1836                 KASSERT(entry->end <= end,
1837                         ("unclip_range: tail wasn't clipped"));
1838                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1839                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1840                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1841                         wakeup(map);
1842                 }
1843                 entry = entry->next;
1844         }
1845
1846         /*
1847          * Simplification does not block so there is no restart case.
1848          */
1849         entry = start_entry;
1850         while (entry != &map->header && entry->start < end) {
1851                 vm_map_simplify_entry(map, entry, countp);
1852                 entry = entry->next;
1853         }
1854 }
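
/*
 * Illustrative sketch (not part of the build) of the clip/unclip pairing
 * used by vm_map_unwire() and vm_map_wire() below; 'map', 'start' and
 * 'end' are assumed to be supplied by the caller.
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      start_entry = vm_map_clip_range(map, start, end, &count,
 *                                      MAP_CLIP_NO_HOLES);
 *      if (start_entry != NULL) {
 *              (operate on the in-transition entries here)
 *              vm_map_unclip_range(map, start_entry, start, end, &count,
 *                                  MAP_CLIP_NO_HOLES);
 *      }
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */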
1855
1856 /*
1857  * Mark the given range as handled by a subordinate map.
1858  *
1859  * This range must have been created with vm_map_find(), and no other
1860  * operations may have been performed on this range prior to calling
1861  * vm_map_submap().
1862  *
1863  * Submappings cannot be removed.
1864  *
1865  * No requirements.
1866  */
1867 int
1868 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
1869 {
1870         vm_map_entry_t entry;
1871         int result = KERN_INVALID_ARGUMENT;
1872         int count;
1873
1874         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1875         vm_map_lock(map);
1876
1877         VM_MAP_RANGE_CHECK(map, start, end);
1878
1879         if (vm_map_lookup_entry(map, start, &entry)) {
1880                 vm_map_clip_start(map, entry, start, &count);
1881         } else {
1882                 entry = entry->next;
1883         }
1884
1885         vm_map_clip_end(map, entry, end, &count);
1886
1887         if ((entry->start == start) && (entry->end == end) &&
1888             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1889             (entry->object.vm_object == NULL)) {
1890                 entry->object.sub_map = submap;
1891                 entry->maptype = VM_MAPTYPE_SUBMAP;
1892                 result = KERN_SUCCESS;
1893         }
1894         vm_map_unlock(map);
1895         vm_map_entry_release(count);
1896
1897         return (result);
1898 }
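
/*
 * Illustrative usage sketch only: a hypothetical caller handing a range
 * over to a subordinate map.  'parent_map', 'sub_map', 'start' and 'end'
 * are assumptions; the range must previously have been created with
 * vm_map_find() as noted above.
 *
 *      rv = vm_map_submap(parent_map, start, end, sub_map);
 *      KKASSERT(rv == KERN_SUCCESS);
 */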
1899
1900 /*
1901  * Sets the protection of the specified address region in the target map. 
1902  * If "set_max" is specified, the maximum protection is to be set;
1903  * otherwise, only the current protection is affected.
1904  *
1905  * The protection is not applicable to submaps, but is applicable to normal
1906  * maps and maps governed by virtual page tables.  For example, when operating
1907  * on a virtual page table our protection basically controls how COW occurs
1908  * on the backing object, whereas the virtual page table abstraction itself
1909  * is an abstraction for userland.
1910  *
1911  * No requirements.
1912  */
1913 int
1914 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1915                vm_prot_t new_prot, boolean_t set_max)
1916 {
1917         vm_map_entry_t current;
1918         vm_map_entry_t entry;
1919         int count;
1920
1921         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1922         vm_map_lock(map);
1923
1924         VM_MAP_RANGE_CHECK(map, start, end);
1925
1926         if (vm_map_lookup_entry(map, start, &entry)) {
1927                 vm_map_clip_start(map, entry, start, &count);
1928         } else {
1929                 entry = entry->next;
1930         }
1931
1932         /*
1933          * Make a first pass to check for protection violations.
1934          */
1935         current = entry;
1936         while ((current != &map->header) && (current->start < end)) {
1937                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
1938                         vm_map_unlock(map);
1939                         vm_map_entry_release(count);
1940                         return (KERN_INVALID_ARGUMENT);
1941                 }
1942                 if ((new_prot & current->max_protection) != new_prot) {
1943                         vm_map_unlock(map);
1944                         vm_map_entry_release(count);
1945                         return (KERN_PROTECTION_FAILURE);
1946                 }
1947
1948                 /*
1949                  * When making a SHARED+RW file mmap writable, update
1950                  * v_lastwrite_ts.
1951                  */
1952                 if (new_prot & PROT_WRITE &&
1953                     (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
1954                     (current->maptype == VM_MAPTYPE_NORMAL ||
1955                      current->maptype == VM_MAPTYPE_VPAGETABLE) &&
1956                     current->object.vm_object &&
1957                     current->object.vm_object->type == OBJT_VNODE) {
1958                         struct vnode *vp;
1959
1960                         vp = current->object.vm_object->handle;
1961                         if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
1962                                 vfs_timestamp(&vp->v_lastwrite_ts);
1963                                 vsetflags(vp, VLASTWRITETS);
1964                                 vn_unlock(vp);
1965                         }
1966                 }
1967                 current = current->next;
1968         }
1969
1970         /*
1971          * Go back and fix up protections. [Note that clipping is not
1972          * necessary the second time.]
1973          */
1974         current = entry;
1975
1976         while ((current != &map->header) && (current->start < end)) {
1977                 vm_prot_t old_prot;
1978
1979                 vm_map_clip_end(map, current, end, &count);
1980
1981                 old_prot = current->protection;
1982                 if (set_max) {
1983                         current->max_protection = new_prot;
1984                         current->protection = new_prot & old_prot;
1985                 } else {
1986                         current->protection = new_prot;
1987                 }
1988
1989                 /*
1990                  * Update physical map if necessary. Worry about copy-on-write
1991                  * here -- CHECK THIS XXX
1992                  */
1993                 if (current->protection != old_prot) {
1994 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1995                                                         VM_PROT_ALL)
1996
1997                         pmap_protect(map->pmap, current->start,
1998                             current->end,
1999                             current->protection & MASK(current));
2000 #undef  MASK
2001                 }
2002
2003                 vm_map_simplify_entry(map, current, &count);
2004
2005                 current = current->next;
2006         }
2007         vm_map_unlock(map);
2008         vm_map_entry_release(count);
2009         return (KERN_SUCCESS);
2010 }
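
/*
 * Illustrative usage sketch only: making a hypothetical range read-only
 * while leaving the maximum protection untouched (set_max == FALSE).
 * 'map', 'start' and 'end' are assumptions supplied by the caller; the
 * errno mapping shows how an mprotect()-style caller might translate the
 * kernel return codes.
 *
 *      rv = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *      if (rv != KERN_SUCCESS)
 *              return (rv == KERN_PROTECTION_FAILURE ? EACCES : EINVAL);
 */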
2011
2012 /*
2013  * This routine traverses a process's map handling the madvise
2014  * system call.  Advisories are classified as either those affecting
2015  * the vm_map_entry structure, or those affecting the underlying
2016  * objects.
2017  *
2018  * The <value> argument is used for extended madvise calls.
2019  *
2020  * No requirements.
2021  */
2022 int
2023 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2024                int behav, off_t value)
2025 {
2026         vm_map_entry_t current, entry;
2027         int modify_map = 0;
2028         int error = 0;
2029         int count;
2030
2031         /*
2032          * Some madvise calls directly modify the vm_map_entry, in which case
2033          * we need to use an exclusive lock on the map and we need to perform 
2034          * various clipping operations.  Otherwise we only need a read-lock
2035          * on the map.
2036          */
2037         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2038
2039         switch(behav) {
2040         case MADV_NORMAL:
2041         case MADV_SEQUENTIAL:
2042         case MADV_RANDOM:
2043         case MADV_NOSYNC:
2044         case MADV_AUTOSYNC:
2045         case MADV_NOCORE:
2046         case MADV_CORE:
2047         case MADV_SETMAP:
2048                 modify_map = 1;
2049                 vm_map_lock(map);
2050                 break;
2051         case MADV_INVAL:
2052         case MADV_WILLNEED:
2053         case MADV_DONTNEED:
2054         case MADV_FREE:
2055                 vm_map_lock_read(map);
2056                 break;
2057         default:
2058                 vm_map_entry_release(count);
2059                 return (EINVAL);
2060         }
2061
2062         /*
2063          * Locate starting entry and clip if necessary.
2064          */
2065
2066         VM_MAP_RANGE_CHECK(map, start, end);
2067
2068         if (vm_map_lookup_entry(map, start, &entry)) {
2069                 if (modify_map)
2070                         vm_map_clip_start(map, entry, start, &count);
2071         } else {
2072                 entry = entry->next;
2073         }
2074
2075         if (modify_map) {
2076                 /*
2077                  * madvise behaviors that are implemented in the vm_map_entry.
2078                  *
2079                  * We clip the vm_map_entry so that behavioral changes are
2080                  * limited to the specified address range.
2081                  */
2082                 for (current = entry;
2083                      (current != &map->header) && (current->start < end);
2084                      current = current->next
2085                 ) {
2086                         if (current->maptype == VM_MAPTYPE_SUBMAP)
2087                                 continue;
2088
2089                         vm_map_clip_end(map, current, end, &count);
2090
2091                         switch (behav) {
2092                         case MADV_NORMAL:
2093                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2094                                 break;
2095                         case MADV_SEQUENTIAL:
2096                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2097                                 break;
2098                         case MADV_RANDOM:
2099                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2100                                 break;
2101                         case MADV_NOSYNC:
2102                                 current->eflags |= MAP_ENTRY_NOSYNC;
2103                                 break;
2104                         case MADV_AUTOSYNC:
2105                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2106                                 break;
2107                         case MADV_NOCORE:
2108                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2109                                 break;
2110                         case MADV_CORE:
2111                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2112                                 break;
2113                         case MADV_SETMAP:
2114                                 /*
2115                                  * Set the page directory page for a map
2116                                  * governed by a virtual page table.  Mark
2117                                  * the entry as being governed by a virtual
2118                                  * page table if it is not.
2119                                  *
2120                                  * XXX the page directory page is stored
2121                                  * in the aux.master_pde field of the map_entry.
2122                                  *
2123                                  * XXX the map simplification code does not
2124                                  * compare this field so weird things may
2125                                  * happen if you do not apply this function
2126                                  * to the entire mapping governed by the
2127                                  * virtual page table.
2128                                  */
2129                                 if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2130                                         error = EINVAL;
2131                                         break;
2132                                 }
2133                                 current->aux.master_pde = value;
2134                                 pmap_remove(map->pmap,
2135                                             current->start, current->end);
2136                                 break;
2137                         case MADV_INVAL:
2138                                 /*
2139                                  * Invalidate the related pmap entries, used
2140                                  * to flush portions of the real kernel's
2141                                  * pmap when the caller has removed or
2142                                  * modified existing mappings in a virtual
2143                                  * page table.
2144                                  *
2145                                  * (exclusive locked map version does not
2146                                  * need the range interlock).
2147                                  */
2148                                 pmap_remove(map->pmap,
2149                                             current->start, current->end);
2150                                 break;
2151                         default:
2152                                 error = EINVAL;
2153                                 break;
2154                         }
2155                         vm_map_simplify_entry(map, current, &count);
2156                 }
2157                 vm_map_unlock(map);
2158         } else {
2159                 vm_pindex_t pindex;
2160                 vm_pindex_t delta;
2161
2162                 /*
2163                  * madvise behaviors that are implemented in the underlying
2164                  * vm_object.
2165                  *
2166                  * Since we don't clip the vm_map_entry, we have to clip
2167                  * the vm_object pindex and count.
2168                  *
2169                  * NOTE!  These functions are only supported on normal maps,
2170                  *        except MADV_INVAL which is also supported on
2171                  *        virtual page tables.
2172                  */
2173                 for (current = entry;
2174                      (current != &map->header) && (current->start < end);
2175                      current = current->next
2176                 ) {
2177                         vm_offset_t useStart;
2178
2179                         if (current->maptype != VM_MAPTYPE_NORMAL &&
2180                             (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2181                              behav != MADV_INVAL)) {
2182                                 continue;
2183                         }
2184
2185                         pindex = OFF_TO_IDX(current->offset);
2186                         delta = atop(current->end - current->start);
2187                         useStart = current->start;
2188
2189                         if (current->start < start) {
2190                                 pindex += atop(start - current->start);
2191                                 delta -= atop(start - current->start);
2192                                 useStart = start;
2193                         }
2194                         if (current->end > end)
2195                                 delta -= atop(current->end - end);
2196
2197                         if ((vm_spindex_t)delta <= 0)
2198                                 continue;
2199
2200                         if (behav == MADV_INVAL) {
2201                                 /*
2202                                  * Invalidate the related pmap entries, used
2203                                  * to flush portions of the real kernel's
2204                                  * pmap when the caller has removed or
2205                                  * modified existing mappings in a virtual
2206                                  * page table.
2207                                  *
2208                                  * (shared locked map version needs the
2209                                  * interlock, see vm_fault()).
2210                                  */
2211                                 struct vm_map_ilock ilock;
2212
2213                                 KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2214                                             useStart + ptoa(delta) <=
2215                                             VM_MAX_USER_ADDRESS,
2216                                          ("Bad range %016jx-%016jx (%016jx)",
2217                                          useStart, useStart + ptoa(delta),
2218                                          delta));
2219                                 vm_map_interlock(map, &ilock,
2220                                                  useStart,
2221                                                  useStart + ptoa(delta));
2222                                 pmap_remove(map->pmap,
2223                                             useStart,
2224                                             useStart + ptoa(delta));
2225                                 vm_map_deinterlock(map, &ilock);
2226                         } else {
2227                                 vm_object_madvise(current->object.vm_object,
2228                                                   pindex, delta, behav);
2229                         }
2230
2231                         /*
2232                          * Try to populate the page table.  Mappings governed
2233                          * by virtual page tables cannot be pre-populated
2234                          * without a lot of work so don't try.
2235                          */
2236                         if (behav == MADV_WILLNEED &&
2237                             current->maptype != VM_MAPTYPE_VPAGETABLE) {
2238                                 pmap_object_init_pt(
2239                                     map->pmap, 
2240                                     useStart,
2241                                     current->protection,
2242                                     current->object.vm_object,
2243                                     pindex, 
2244                                     (delta << PAGE_SHIFT),
2245                                     MAP_PREFAULT_MADVISE
2246                                 );
2247                         }
2248                 }
2249                 vm_map_unlock_read(map);
2250         }
2251         vm_map_entry_release(count);
2252         return(error);
2253 }       
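
/*
 * Illustrative usage sketch only, with hypothetical arguments.
 * MADV_WILLNEED only needs the shared (read-locked) path and ignores the
 * value argument, while MADV_SETMAP takes the exclusive path and uses
 * 'value' as the master page directory entry of a virtual page table
 * mapping ('pde_value' below is a placeholder).
 *
 *      error = vm_map_madvise(map, start, end, MADV_WILLNEED, 0);
 *      error = vm_map_madvise(map, start, end, MADV_SETMAP, pde_value);
 */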
2254
2255
2256 /*
2257  * Sets the inheritance of the specified address range in the target map.
2258  * Inheritance affects how the map will be shared with child maps at the
2259  * time of vm_map_fork.
2260  */
2261 int
2262 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2263                vm_inherit_t new_inheritance)
2264 {
2265         vm_map_entry_t entry;
2266         vm_map_entry_t temp_entry;
2267         int count;
2268
2269         switch (new_inheritance) {
2270         case VM_INHERIT_NONE:
2271         case VM_INHERIT_COPY:
2272         case VM_INHERIT_SHARE:
2273                 break;
2274         default:
2275                 return (KERN_INVALID_ARGUMENT);
2276         }
2277
2278         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2279         vm_map_lock(map);
2280
2281         VM_MAP_RANGE_CHECK(map, start, end);
2282
2283         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2284                 entry = temp_entry;
2285                 vm_map_clip_start(map, entry, start, &count);
2286         } else
2287                 entry = temp_entry->next;
2288
2289         while ((entry != &map->header) && (entry->start < end)) {
2290                 vm_map_clip_end(map, entry, end, &count);
2291
2292                 entry->inheritance = new_inheritance;
2293
2294                 vm_map_simplify_entry(map, entry, &count);
2295
2296                 entry = entry->next;
2297         }
2298         vm_map_unlock(map);
2299         vm_map_entry_release(count);
2300         return (KERN_SUCCESS);
2301 }
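
/*
 * Illustrative usage sketch only: marking a hypothetical range as shared
 * with child maps across fork, which is what an minherit(2)-style caller
 * would request with VM_INHERIT_SHARE.
 *
 *      if (vm_map_inherit(map, start, end, VM_INHERIT_SHARE) != KERN_SUCCESS)
 *              return (EINVAL);
 */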
2302
2303 /*
2304  * Implement the semantics of mlock
2305  */
2306 int
2307 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2308               boolean_t new_pageable)
2309 {
2310         vm_map_entry_t entry;
2311         vm_map_entry_t start_entry;
2312         vm_offset_t end;
2313         int rv = KERN_SUCCESS;
2314         int count;
2315
2316         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2317         vm_map_lock(map);
2318         VM_MAP_RANGE_CHECK(map, start, real_end);
2319         end = real_end;
2320
2321         start_entry = vm_map_clip_range(map, start, end, &count,
2322                                         MAP_CLIP_NO_HOLES);
2323         if (start_entry == NULL) {
2324                 vm_map_unlock(map);
2325                 vm_map_entry_release(count);
2326                 return (KERN_INVALID_ADDRESS);
2327         }
2328
2329         if (new_pageable == 0) {
2330                 entry = start_entry;
2331                 while ((entry != &map->header) && (entry->start < end)) {
2332                         vm_offset_t save_start;
2333                         vm_offset_t save_end;
2334
2335                         /*
2336                          * Already user wired or hard wired (trivial cases)
2337                          */
2338                         if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2339                                 entry = entry->next;
2340                                 continue;
2341                         }
2342                         if (entry->wired_count != 0) {
2343                                 entry->wired_count++;
2344                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2345                                 entry = entry->next;
2346                                 continue;
2347                         }
2348
2349                         /*
2350                          * A new wiring requires instantiation of appropriate
2351                          * management structures and the faulting in of the
2352                          * page.
2353                          */
2354                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2355                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2356                                 int copyflag = entry->eflags &
2357                                                MAP_ENTRY_NEEDS_COPY;
2358                                 if (copyflag && ((entry->protection &
2359                                                   VM_PROT_WRITE) != 0)) {
2360                                         vm_map_entry_shadow(entry, 0);
2361                                 } else if (entry->object.vm_object == NULL &&
2362                                            !map->system_map) {
2363                                         vm_map_entry_allocate_object(entry);
2364                                 }
2365                         }
2366                         entry->wired_count++;
2367                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2368
2369                         /*
2370                          * Now fault in the area.  Note that vm_fault_wire()
2371                          * may release the map lock temporarily, it will be
2372                          * relocked on return.  The in-transition
2373                          * flag protects the entries. 
2374                          */
2375                         save_start = entry->start;
2376                         save_end = entry->end;
2377                         rv = vm_fault_wire(map, entry, TRUE, 0);
2378                         if (rv) {
2379                                 CLIP_CHECK_BACK(entry, save_start);
2380                                 for (;;) {
2381                                         KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2382                                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2383                                         entry->wired_count = 0;
2384                                         if (entry->end == save_end)
2385                                                 break;
2386                                         entry = entry->next;
2387                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2388                                 }
2389                                 end = save_start;       /* unwire the rest */
2390                                 break;
2391                         }
2392                         /*
2393                          * note that even though the entry might have been
2394                          * clipped, the USER_WIRED flag we set prevents
2395                          * duplication so we do not have to do a 
2396                          * clip check.
2397                          */
2398                         entry = entry->next;
2399                 }
2400
2401                 /*
2402                  * If we failed fall through to the unwiring section to
2403                  * unwire what we had wired so far.  'end' has already
2404                  * been adjusted.
2405                  */
2406                 if (rv)
2407                         new_pageable = 1;
2408
2409                 /*
2410                  * start_entry might have been clipped if we unlocked the
2411                  * map and blocked.  No matter how clipped it has gotten
2412                  * there should be a fragment that is on our start boundary.
2413                  */
2414                 CLIP_CHECK_BACK(start_entry, start);
2415         }
2416
2417         /*
2418          * Deal with the unwiring case.
2419          */
2420         if (new_pageable) {
2421                 /*
2422                  * This is the unwiring case.  We must first ensure that the
2423                  * range to be unwired is really wired down.  We know there
2424                  * are no holes.
2425                  */
2426                 entry = start_entry;
2427                 while ((entry != &map->header) && (entry->start < end)) {
2428                         if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2429                                 rv = KERN_INVALID_ARGUMENT;
2430                                 goto done;
2431                         }
2432                         KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
2433                         entry = entry->next;
2434                 }
2435
2436                 /*
2437                  * Now decrement the wiring count for each region. If a region
2438                  * becomes completely unwired, unwire its physical pages and
2439                  * mappings.
2440                  */
2441                 /*
2442                  * The map entries are processed in a loop, checking to make
2443                  * sure each entry is wired and asserting it has a wired count.
2444                  * At one point a second loop was inserted more-or-less in the
2445                  * middle of the unwiring path which picked up the "entry" loop
2446                  * variable from the first loop without first resetting it to
2447                  * start_entry, so that second loop was never entered and the
2448                  * pages backing the entries were never unwired, leaking wired
2449                  * pages.  The loop below resets "entry" to avoid repeating that.
2450                  */
2451                 entry = start_entry;
2452                 while ((entry != &map->header) && (entry->start < end)) {
2453                         KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2454                                 ("expected USER_WIRED on entry %p", entry));
2455                         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2456                         entry->wired_count--;
2457                         if (entry->wired_count == 0)
2458                                 vm_fault_unwire(map, entry);
2459                         entry = entry->next;
2460                 }
2461         }
2462 done:
2463         vm_map_unclip_range(map, start_entry, start, real_end, &count,
2464                 MAP_CLIP_NO_HOLES);
2465         vm_map_unlock(map);
2466         vm_map_entry_release(count);
2467
2468         return (rv);
2469 }
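
/*
 * Illustrative usage sketch only: how an mlock()/munlock()-style caller
 * might drive vm_map_unwire().  Passing FALSE wires (faults in) the range,
 * passing TRUE unwires it; 'map', 'addr' and 'size' are assumptions and
 * the range is expected to be page aligned by the caller.
 *
 *      rv = vm_map_unwire(map, addr, addr + size, FALSE);
 *      rv = vm_map_unwire(map, addr, addr + size, TRUE);
 */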
2470
2471 /*
2472  * Sets the pageability of the specified address range in the target map.
2473  * Regions specified as not pageable require locked-down physical
2474  * memory and physical page maps.
2475  *
2476  * The map must not be locked, but a reference must remain to the map
2477  * throughout the call.
2478  *
2479  * This function may be called via the zalloc path and must properly
2480  * reserve map entries for kernel_map.
2481  *
2482  * No requirements.
2483  */
2484 int
2485 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2486 {
2487         vm_map_entry_t entry;
2488         vm_map_entry_t start_entry;
2489         vm_offset_t end;
2490         int rv = KERN_SUCCESS;
2491         int count;
2492
2493         if (kmflags & KM_KRESERVE)
2494                 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2495         else
2496                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2497         vm_map_lock(map);
2498         VM_MAP_RANGE_CHECK(map, start, real_end);
2499         end = real_end;
2500
2501         start_entry = vm_map_clip_range(map, start, end, &count,
2502                                         MAP_CLIP_NO_HOLES);
2503         if (start_entry == NULL) {
2504                 vm_map_unlock(map);
2505                 rv = KERN_INVALID_ADDRESS;
2506                 goto failure;
2507         }
2508         if ((kmflags & KM_PAGEABLE) == 0) {
2509                 /*
2510                  * Wiring.  
2511                  *
2512                  * 1.  Holding the write lock, we create any shadow or zero-fill
2513                  * objects that need to be created. Then we clip each map
2514                  * entry to the region to be wired and increment its wiring
2515                  * count.  We create objects before clipping the map entries
2516                  * to avoid object proliferation.
2517                  *
2518                  * 2.  We downgrade to a read lock, and call vm_fault_wire to
2519                  * fault in the pages for any newly wired area (wired_count is
2520                  * 1).
2521                  *
2522                  * Downgrading to a read lock for vm_fault_wire avoids a 
2523                  * possible deadlock with another process that may have faulted
2524                  * on one of the pages to be wired (it would mark the page busy,
2525                  * blocking us, then in turn block on the map lock that we
2526                  * hold).  Because of problems in the recursive lock package,
2527                  * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2528                  * any actions that require the write lock must be done
2529                  * beforehand.  Because we keep the read lock on the map, the
2530                  * copy-on-write status of the entries we modify here cannot
2531                  * change.
2532                  */
2533                 entry = start_entry;
2534                 while ((entry != &map->header) && (entry->start < end)) {
2535                         /*
2536                          * Trivial case if the entry is already wired
2537                          */
2538                         if (entry->wired_count) {
2539                                 entry->wired_count++;
2540                                 entry = entry->next;
2541                                 continue;
2542                         }
2543
2544                         /*
2545                          * The entry is being newly wired, we have to setup
2546                          * appropriate management structures.  A shadow 
2547                          * object is required for a copy-on-write region,
2548                          * or a normal object for a zero-fill region.  We
2549                          * do not have to do this for entries that point to sub
2550                          * maps because we won't hold the lock on the sub map.
2551                          */
2552                         if (entry->maptype == VM_MAPTYPE_NORMAL ||
2553                             entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2554                                 int copyflag = entry->eflags &
2555                                                MAP_ENTRY_NEEDS_COPY;
2556                                 if (copyflag && ((entry->protection &
2557                                                   VM_PROT_WRITE) != 0)) {
2558                                         vm_map_entry_shadow(entry, 0);
2559                                 } else if (entry->object.vm_object == NULL &&
2560                                            !map->system_map) {
2561                                         vm_map_entry_allocate_object(entry);
2562                                 }
2563                         }
2564
2565                         entry->wired_count++;
2566                         entry = entry->next;
2567                 }
2568
2569                 /*
2570                  * Pass 2.
2571                  */
2572
2573                 /*
2574                  * HACK HACK HACK HACK
2575                  *
2576                  * vm_fault_wire() temporarily unlocks the map to avoid
2577                  * deadlocks.  The in-transition flag from the vm_map_clip_range
2578                  * call should protect us from changes while the map is
2579                  * unlocked.
2580                  *
2581                  * NOTE: Previously this comment stated that clipping might
2582                  *       still occur while the entry is unlocked, but from
2583                  *       what I can tell it actually cannot.
2584                  *
2585                  *       It is unclear whether the CLIP_CHECK_*() calls
2586                  *       are still needed but we keep them in anyway.
2587                  *
2588                  * HACK HACK HACK HACK
2589                  */
2590
2591                 entry = start_entry;
2592                 while (entry != &map->header && entry->start < end) {
2593                         /*
2594                          * If vm_fault_wire fails for any page we need to undo
2595                          * what has been done.  We decrement the wiring count
2596                          * for those pages which have not yet been wired (now)
2597                          * and unwire those that have (later).
2598                          */
2599                         vm_offset_t save_start = entry->start;
2600                         vm_offset_t save_end = entry->end;
2601
2602                         if (entry->wired_count == 1)
2603                                 rv = vm_fault_wire(map, entry, FALSE, kmflags);
2604                         if (rv) {
2605                                 CLIP_CHECK_BACK(entry, save_start);
2606                                 for (;;) {
2607                                         KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
2608                                         entry->wired_count = 0;
2609                                         if (entry->end == save_end)
2610                                                 break;
2611                                         entry = entry->next;
2612                                         KASSERT(entry != &map->header, ("bad entry clip during backout"));
2613                                 }
2614                                 end = save_start;
2615                                 break;
2616                         }
2617                         CLIP_CHECK_FWD(entry, save_end);
2618                         entry = entry->next;
2619                 }
2620
2621                 /*
2622                  * If a failure occurred, undo everything by falling through
2623                  * to the unwiring code.  'end' has already been adjusted
2624                  * appropriately.
2625                  */
2626                 if (rv)
2627                         kmflags |= KM_PAGEABLE;
2628
2629                 /*
2630                  * start_entry is still IN_TRANSITION but may have been 
2631                  * clipped since vm_fault_wire() unlocks and relocks the
2632                  * map.  No matter how clipped it has gotten there should
2633                  * be a fragment that is on our start boundary.
2634                  */
2635                 CLIP_CHECK_BACK(start_entry, start);
2636         }
2637
2638         if (kmflags & KM_PAGEABLE) {
2639                 /*
2640                  * This is the unwiring case.  We must first ensure that the
2641                  * range to be unwired is really wired down.  We know there
2642                  * are no holes.
2643                  */
2644                 entry = start_entry;
2645                 while ((entry != &map->header) && (entry->start < end)) {
2646                         if (entry->wired_count == 0) {
2647                                 rv = KERN_INVALID_ARGUMENT;
2648                                 goto done;
2649                         }
2650                         entry = entry->next;
2651                 }
2652
2653                 /*
2654                  * Now decrement the wiring count for each region. If a region
2655                  * becomes completely unwired, unwire its physical pages and
2656                  * mappings.
2657                  */
2658                 entry = start_entry;
2659                 while ((entry != &map->header) && (entry->start < end)) {
2660                         entry->wired_count--;
2661                         if (entry->wired_count == 0)
2662                                 vm_fault_unwire(map, entry);
2663                         entry = entry->next;
2664                 }
2665         }
2666 done:
2667         vm_map_unclip_range(map, start_entry, start, real_end,
2668                             &count, MAP_CLIP_NO_HOLES);
2669         vm_map_unlock(map);
2670 failure:
2671         if (kmflags & KM_KRESERVE)
2672                 vm_map_entry_krelease(count);
2673         else
2674                 vm_map_entry_release(count);
2675         return (rv);
2676 }
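
/*
 * Illustrative usage sketch only, with hypothetical 'map', 'addr' and
 * 'size': KM_KRESERVE selects the kernel reserve pool for the temporary
 * map entries (see above), and adding KM_PAGEABLE requests the unwiring
 * path instead of wiring.
 *
 *      rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE);
 *      rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE | KM_PAGEABLE);
 */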
2677
2678 /*
2679  * Mark a newly allocated address range as wired but do not fault in
2680  * the pages.  The caller is expected to load the pages into the object.
2681  *
2682  * The map must be locked on entry and will remain locked on return.
2683  * No other requirements.
2684  */
2685 void
2686 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2687                        int *countp)
2688 {
2689         vm_map_entry_t scan;
2690         vm_map_entry_t entry;
2691
2692         entry = vm_map_clip_range(map, addr, addr + size,
2693                                   countp, MAP_CLIP_NO_HOLES);
2694         for (scan = entry;
2695              scan != &map->header && scan->start < addr + size;
2696              scan = scan->next) {
2697             KKASSERT(scan->wired_count == 0);
2698             scan->wired_count = 1;
2699         }
2700         vm_map_unclip_range(map, entry, addr, addr + size,
2701                             countp, MAP_CLIP_NO_HOLES);
2702 }
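
/*
 * Illustrative usage sketch only: a hypothetical caller that has just
 * allocated a range and intends to load the pages into the object itself,
 * so it marks the range wired without faulting anything in.  'map', 'addr'
 * and 'size' are assumptions.
 *
 *      count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *      vm_map_lock(map);
 *      vm_map_set_wired_quick(map, addr, size, &count);
 *      vm_map_unlock(map);
 *      vm_map_entry_release(count);
 */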
2703
2704 /*
2705  * Push any dirty cached pages in the address range to their pager.
2706  * If syncio is TRUE, dirty pages are written synchronously.
2707  * If invalidate is TRUE, any cached pages are freed as well.
2708  *
2709  * This routine is called by sys_msync()
2710  *
2711  * Returns an error if any part of the specified range is not mapped.
2712  *
2713  * No requirements.
2714  */
2715 int
2716 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2717              boolean_t syncio, boolean_t invalidate)
2718 {
2719         vm_map_entry_t current;
2720         vm_map_entry_t entry;
2721         vm_size_t size;
2722         vm_object_t object;
2723         vm_object_t tobj;
2724         vm_ooffset_t offset;
2725
2726         vm_map_lock_read(map);
2727         VM_MAP_RANGE_CHECK(map, start, end);
2728         if (!vm_map_lookup_entry(map, start, &entry)) {
2729                 vm_map_unlock_read(map);
2730                 return (KERN_INVALID_ADDRESS);
2731         }
2732         lwkt_gettoken(&map->token);
2733
2734         /*
2735          * Make a first pass to check for holes.
2736          */
2737         for (current = entry; current->start < end; current = current->next) {
2738                 if (current->maptype == VM_MAPTYPE_SUBMAP) {
2739                         lwkt_reltoken(&map->token);
2740                         vm_map_unlock_read(map);
2741                         return (KERN_INVALID_ARGUMENT);
2742                 }
2743                 if (end > current->end &&
2744                     (current->next == &map->header ||
2745                         current->end != current->next->start)) {
2746                         lwkt_reltoken(&map->token);
2747                         vm_map_unlock_read(map);
2748                         return (KERN_INVALID_ADDRESS);
2749                 }
2750         }
2751
2752         if (invalidate)
2753                 pmap_remove(vm_map_pmap(map), start, end);
2754
2755         /*
2756          * Make a second pass, cleaning/uncaching pages from the indicated
2757          * objects as we go.
2758          */
2759         for (current = entry; current->start < end; current = current->next) {
2760                 offset = current->offset + (start - current->start);
2761                 size = (end <= current->end ? end : current->end) - start;
2762
2763                 switch(current->maptype) {
2764                 case VM_MAPTYPE_SUBMAP:
2765                 {
2766                         vm_map_t smap;
2767                         vm_map_entry_t tentry;
2768                         vm_size_t tsize;
2769
2770                         smap = current->object.sub_map;
2771                         vm_map_lock_read(smap);
2772                         vm_map_lookup_entry(smap, offset, &tentry);
2773                         tsize = tentry->end - offset;
2774                         if (tsize < size)
2775                                 size = tsize;
2776                         object = tentry->object.vm_object;
2777                         offset = tentry->offset + (offset - tentry->start);
2778                         vm_map_unlock_read(smap);
2779                         break;
2780                 }
2781                 case VM_MAPTYPE_NORMAL:
2782                 case VM_MAPTYPE_VPAGETABLE:
2783                         object = current->object.vm_object;
2784                         break;
2785                 default:
2786                         object = NULL;
2787                         break;
2788                 }
2789
2790                 if (object)
2791                         vm_object_hold(object);
2792
2793                 /*
2794                  * Note that there is absolutely no sense in writing out
2795                  * anonymous objects, so we track down the vnode object
2796                  * to write out.
2797                  * We invalidate (remove) all pages from the address space
2798                  * anyway, for semantic correctness.
2799                  *
2800                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
2801                  * may start out with a NULL object.
2802                  */
2803                 while (object && (tobj = object->backing_object) != NULL) {
2804                         vm_object_hold(tobj);
2805                         if (tobj == object->backing_object) {
2806                                 vm_object_lock_swap();
2807                                 offset += object->backing_object_offset;
2808                                 vm_object_drop(object);
2809                                 object = tobj;
2810                                 if (object->size < OFF_TO_IDX(offset + size))
2811                                         size = IDX_TO_OFF(object->size) -
2812                                                offset;
2813                                 break;
2814                         }
2815                         vm_object_drop(tobj);
2816                 }
2817                 if (object && (object->type == OBJT_VNODE) && 
2818                     (current->protection & VM_PROT_WRITE) &&
2819                     (object->flags & OBJ_NOMSYNC) == 0) {
2820                         /*
2821                          * Flush pages if writing is allowed, invalidate them
2822                          * if invalidation is requested.  Pages undergoing I/O
2823                          * will be ignored by vm_object_page_remove().
2824                          *
2825                          * We cannot lock the vnode and then wait for paging
2826                          * to complete without deadlocking against vm_fault.
2827                          * Instead we simply call vm_object_page_remove() and
2828                          * allow it to block internally on a page-by-page 
2829                          * basis when it encounters pages undergoing async 
2830                          * I/O.
2831                          */
2832                         int flags;
2833
2834                         /* no chain wait needed for vnode objects */
2835                         vm_object_reference_locked(object);
2836                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
2837                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
2838                         flags |= invalidate ? OBJPC_INVAL : 0;
2839
2840                         /*
2841                          * When operating on a virtual page table just
2842                          * flush the whole object.  XXX we probably ought
2843                          * to 
2844                          */
2845                         switch(current->maptype) {
2846                         case VM_MAPTYPE_NORMAL:
2847                                 vm_object_page_clean(object,
2848                                     OFF_TO_IDX(offset),
2849                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2850                                     flags);
2851                                 break;
2852                         case VM_MAPTYPE_VPAGETABLE:
2853                                 vm_object_page_clean(object, 0, 0, flags);
2854                                 break;
2855                         }
2856                         vn_unlock(((struct vnode *)object->handle));
2857                         vm_object_deallocate_locked(object);
2858                 }
2859                 if (object && invalidate &&
2860                    ((object->type == OBJT_VNODE) ||
2861                     (object->type == OBJT_DEVICE) ||
2862                     (object->type == OBJT_MGTDEVICE))) {
2863                         int clean_only = 
2864                                 ((object->type == OBJT_DEVICE) ||
2865                                 (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
2866                         /* no chain wait needed for vnode/device objects */
2867                         vm_object_reference_locked(object);
2868                         switch(current->maptype) {
2869                         case VM_MAPTYPE_NORMAL:
2870                                 vm_object_page_remove(object,
2871                                     OFF_TO_IDX(offset),
2872                                     OFF_TO_IDX(offset + size + PAGE_MASK),
2873                                     clean_only);
2874                                 break;
2875                         case VM_MAPTYPE_VPAGETABLE:
2876                                 vm_object_page_remove(object, 0, 0, clean_only);
2877                                 break;
2878                         }
2879                         vm_object_deallocate_locked(object);
2880                 }
2881                 start += size;
2882                 if (object)
2883                         vm_object_drop(object);
2884         }
2885
2886         lwkt_reltoken(&map->token);
2887         vm_map_unlock_read(map);
2888
2889         return (KERN_SUCCESS);
2890 }
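
#if 0
/*
 * Illustrative sketch of how an msync()-style caller might translate
 * its flags into the syncio/invalidate arguments above.  The MS_*
 * flags come from <sys/mman.h>; the helper name and the flag handling
 * are assumptions here -- see the real sys_msync() for the
 * authoritative translation.
 */
static int
example_msync(vm_map_t map, vm_offset_t addr, vm_size_t size, int flags)
{
	boolean_t syncio = ((flags & MS_ASYNC) == 0);
	boolean_t invalidate = ((flags & MS_INVALIDATE) != 0);

	return (vm_map_clean(map, addr, addr + size, syncio, invalidate));
}
#endif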
2891
2892 /*
2893  * Make the region specified by this entry pageable.
2894  *
2895  * The vm_map must be exclusively locked.
2896  */
2897 static void 
2898 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2899 {
2900         entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2901         entry->wired_count = 0;
2902         vm_fault_unwire(map, entry);
2903 }
2904
2905 /*
2906  * Deallocate the given entry from the target map.
2907  *
2908  * The vm_map must be exclusively locked.
2909  */
2910 static void
2911 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
2912 {
2913         vm_map_entry_unlink(map, entry);
2914         map->size -= entry->end - entry->start;
2915
2916         switch(entry->maptype) {
2917         case VM_MAPTYPE_NORMAL:
2918         case VM_MAPTYPE_VPAGETABLE:
2919         case VM_MAPTYPE_SUBMAP:
2920                 vm_object_deallocate(entry->object.vm_object);
2921                 break;
2922         case VM_MAPTYPE_UKSMAP:
2923                 /* XXX TODO */
2924                 break;
2925         default:
2926                 break;
2927         }
2928
2929         vm_map_entry_dispose(map, entry, countp);
2930 }
2931
2932 /*
2933  * Deallocates the given address range from the target map.
2934  *
2935  * The vm_map must be exclusively locked.
2936  */
2937 int
2938 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
2939 {
2940         vm_object_t object;
2941         vm_map_entry_t entry;
2942         vm_map_entry_t first_entry;
2943         vm_offset_t hole_start;
2944
2945         ASSERT_VM_MAP_LOCKED(map);
2946         lwkt_gettoken(&map->token);
2947 again:
2948         /*
2949          * Find the start of the region, and clip it.  Set entry to point
2950          * at the first record containing the requested address or, if no
2951          * such record exists, the next record with a greater address.  The
2952          * loop will run from this point until a record beyond the termination
2953          * address is encountered.
2954          *
2955          * Adjust freehint[] for either the clip case or the extension case.
2956          *
2957          * GGG see other GGG comment.
2958          */
2959         if (vm_map_lookup_entry(map, start, &first_entry)) {
2960                 entry = first_entry;
2961                 vm_map_clip_start(map, entry, start, countp);
2962                 hole_start = start;
2963         } else {
2964                 entry = first_entry->next;
2965                 if (entry == &map->header)
2966                         hole_start = first_entry->start;
2967                 else
2968                         hole_start = first_entry->end;
2969         }
2970
2971         /*
2972          * Step through all entries in this region
2973          */
2974         while ((entry != &map->header) && (entry->start < end)) {
2975                 vm_map_entry_t next;
2976                 vm_offset_t s, e;
2977                 vm_pindex_t offidxstart, offidxend, count;
2978
2979                 /*
2980                  * If we hit an in-transition entry we have to sleep and
2981                  * retry.  It's easier (and not really slower) to just retry
2982                  * since this case occurs so rarely and the hint is already
2983                  * pointing at the right place.  We have to reset the
2984                  * start offset so as not to accidentally delete an entry
2985                  * another process just created in vacated space.
2986                  */
2987                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2988                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2989                         start = entry->start;
2990                         ++mycpu->gd_cnt.v_intrans_coll;
2991                         ++mycpu->gd_cnt.v_intrans_wait;
2992                         vm_map_transition_wait(map, 1);
2993                         goto again;
2994                 }
2995                 vm_map_clip_end(map, entry, end, countp);
2996
2997                 s = entry->start;
2998                 e = entry->end;
2999                 next = entry->next;
3000
3001                 offidxstart = OFF_TO_IDX(entry->offset);
3002                 count = OFF_TO_IDX(e - s);
3003
3004                 switch(entry->maptype) {
3005                 case VM_MAPTYPE_NORMAL:
3006                 case VM_MAPTYPE_VPAGETABLE:
3007                 case VM_MAPTYPE_SUBMAP:
3008                         object = entry->object.vm_object;
3009                         break;
3010                 default:
3011                         object = NULL;
3012                         break;
3013                 }
3014
3015                 /*
3016                  * Unwire before removing addresses from the pmap; otherwise,
3017                  * unwiring will put the entries back in the pmap.
3018                  *
3019                  * Generally speaking, doing a bulk pmap_remove() before
3020                  * removing the pages from the VM object is better at
3021                  * reducing unnecessary IPIs.  The pmap code is now optimized
3022                  * to not blindly iterate the range when pt and pd pages
3023                  * are missing.
3024                  */
3025                 if (entry->wired_count != 0)
3026                         vm_map_entry_unwire(map, entry);
3027
3028                 offidxend = offidxstart + count;
3029
3030                 if (object == &kernel_object) {
3031                         pmap_remove(map->pmap, s, e);
3032                         vm_object_hold(object);
3033                         vm_object_page_remove(object, offidxstart,
3034                                               offidxend, FALSE);
3035                         vm_object_drop(object);
3036                 } else if (object && object->type != OBJT_DEFAULT &&
3037                            object->type != OBJT_SWAP) {
3038                         /*
3039                          * vnode object routines cannot be chain-locked,
3040                          * but since we aren't removing pages from the
3041                          * object here we can use a shared hold.
3042                          */
3043                         vm_object_hold_shared(object);
3044                         pmap_remove(map->pmap, s, e);
3045                         vm_object_drop(object);
3046                 } else if (object) {
3047                         vm_object_hold(object);
3048                         vm_object_chain_acquire(object, 0);
3049                         pmap_remove(map->pmap, s, e);
3050
3051                         if (object != NULL &&
3052                             object->ref_count != 1 &&
3053                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3054                              OBJ_ONEMAPPING &&
3055                             (object->type == OBJT_DEFAULT ||
3056                              object->type == OBJT_SWAP)) {
3057                                 /*
3058                                  * When ONEMAPPING is set we can destroy the
3059                                  * pages underlying the entry's range.
3060                                  */
3061                                 vm_object_collapse(object, NULL);
3062                                 vm_object_page_remove(object, offidxstart,
3063                                                       offidxend, FALSE);
3064                                 if (object->type == OBJT_SWAP) {
3065                                         swap_pager_freespace(object,
3066                                                              offidxstart,
3067                                                              count);
3068                                 }
3069                                 if (offidxend >= object->size &&
3070                                     offidxstart < object->size) {
3071                                         object->size = offidxstart;
3072                                 }
3073                         }
3074                         vm_object_chain_release(object);
3075                         vm_object_drop(object);
3076                 } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3077                         pmap_remove(map->pmap, s, e);
3078                 }
3079
3080                 /*
3081                  * Delete the entry (which may delete the object) only after
3082                  * removing all pmap entries pointing to its pages.
3083                  * (Otherwise, its page frames may be reallocated, and any
3084                  * modify bits will be set in the wrong object!)
3085                  */
3086                 vm_map_entry_delete(map, entry, countp);
3087                 entry = next;
3088         }
3089         if (entry == &map->header)
3090                 vm_map_freehint_hole(map, hole_start, entry->end - hole_start);
3091         else
3092                 vm_map_freehint_hole(map, hole_start,
3093                                      entry->start - hole_start);
3094
3095         lwkt_reltoken(&map->token);
3096
3097         return (KERN_SUCCESS);
3098 }
3099
3100 /*
3101  * Remove the given address range from the target map.
3102  * This is the exported form of vm_map_delete.
3103  *
3104  * No requirements.
3105  */
3106 int
3107 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3108 {
3109         int result;
3110         int count;
3111
3112         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3113         vm_map_lock(map);
3114         VM_MAP_RANGE_CHECK(map, start, end);
3115         result = vm_map_delete(map, start, end, &count);
3116         vm_map_unlock(map);
3117         vm_map_entry_release(count);
3118
3119         return (result);
3120 }
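
#if 0
/*
 * Illustrative sketch: an munmap()-style teardown reduces to a single
 * vm_map_remove() call on page-aligned bounds.  The wrapper name is
 * hypothetical; trunc_page()/round_page() are the standard macros.
 */
static int
example_unmap(vm_map_t map, vm_offset_t addr, vm_size_t len)
{
	return (vm_map_remove(map, trunc_page(addr),
			      round_page(addr + len)));
}
#endif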
3121
3122 /*
3123  * Assert that the target map allows the specified privilege on the
3124  * entire address region given.  The entire region must be allocated.
3125  *
3126  * The caller must specify whether the vm_map is already locked or not.
3127  */
3128 boolean_t
3129 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3130                         vm_prot_t protection, boolean_t have_lock)
3131 {
3132         vm_map_entry_t entry;
3133         vm_map_entry_t tmp_entry;
3134         boolean_t result;
3135
3136         if (have_lock == FALSE)
3137                 vm_map_lock_read(map);
3138
3139         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3140                 if (have_lock == FALSE)
3141                         vm_map_unlock_read(map);
3142                 return (FALSE);
3143         }
3144         entry = tmp_entry;
3145
3146         result = TRUE;
3147         while (start < end) {
3148                 if (entry == &map->header) {
3149                         result = FALSE;
3150                         break;
3151                 }
3152                 /*
3153                  * No holes allowed!
3154                  */
3155
3156                 if (start < entry->start) {
3157                         result = FALSE;
3158                         break;
3159                 }
3160                 /*
3161                  * Check protection associated with entry.
3162                  */
3163
3164                 if ((entry->protection & protection) != protection) {
3165                         result = FALSE;
3166                         break;
3167                 }
3168                 /* go to next entry */
3169
3170                 start = entry->end;
3171                 entry = entry->next;
3172         }
3173         if (have_lock == FALSE)
3174                 vm_map_unlock_read(map);
3175         return (result);
3176 }
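
#if 0
/*
 * Illustrative sketch: verifying that a user range is mapped with at
 * least read permission before operating on it.  The helper name is
 * hypothetical.  With have_lock == FALSE the map is only read-locked
 * for the duration of the check, so the answer is advisory; callers
 * needing a stable result hold the map themselves and pass TRUE.
 */
static boolean_t
example_range_is_readable(vm_map_t map, vm_offset_t addr, vm_size_t len)
{
	return (vm_map_check_protection(map, trunc_page(addr),
					round_page(addr + len),
					VM_PROT_READ, FALSE));
}
#endif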
3177
3178 /*
3179  * If appropriate this function shadows the original object with a new object
3180  * and moves the VM pages from the original object to the new object.
3181  * The original object will also be collapsed, if possible.
3182  *
3183  * Caller must supply entry->object.vm_object held and chain_acquired, and
3184  * should chain_release and drop the object upon return.
3185  *
3186  * We can only do this for normal memory objects with a single mapping, and
3187  * it only makes sense to do it if there are 2 or more refs on the original
3188  * object.  i.e. typically a memory object that has been extended into
3189  * multiple vm_map_entry's with non-overlapping ranges.
3190  *
3191  * This makes it easier to remove unused pages and keeps object inheritance
3192  * from having a negative impact on memory usage.
3193  *
3194  * On return the (possibly new) entry->object.vm_object will have an
3195  * additional ref on it for the caller to dispose of (usually by cloning
3196  * the vm_map_entry).  The additional ref had to be done in this routine
3197  * to avoid racing a collapse.  The object's ONEMAPPING flag will also be
3198  * cleared.
3199  *
3200  * The vm_map must be locked and its token held.
3201  */
3202 static void
3203 vm_map_split(vm_map_entry_t entry, vm_object_t oobject)
3204 {
3205         /* OPTIMIZED */
3206         vm_object_t nobject, bobject;
3207         vm_offset_t s, e;
3208         vm_page_t m;
3209         vm_pindex_t offidxstart, offidxend, idx;
3210         vm_size_t size;
3211         vm_ooffset_t offset;
3212         int useshadowlist;
3213
3214         /*
3215          * Optimize away object locks for vnode objects.  Important exit/exec
3216          * critical path.
3217          *
3218          * OBJ_ONEMAPPING doesn't apply to vnode objects but clear the flag
3219          * anyway.
3220          */
3221         if (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) {
3222                 vm_object_reference_quick(oobject);
3223                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3224                 return;
3225         }
3226
3227 #if 0
3228         /*
3229          * Original object cannot be split?
3230          */
3231         if (oobject->handle == NULL) {
3232                 vm_object_reference_locked_chain_held(oobject);
3233                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3234                 return;
3235         }
3236 #endif
3237
3238         /*
3239          * Collapse original object with its backing store as an
3240          * optimization to reduce chain lengths when possible.
3241          *
3242          * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's
3243          * for oobject, so there's no point collapsing it.
3244          *
3245          * Then re-check whether the object can be split.
3246          */
3247         vm_object_collapse(oobject, NULL);
3248
3249         if (oobject->ref_count <= 1 ||
3250             (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) ||
3251             (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) {
3252                 vm_object_reference_locked_chain_held(oobject);
3253                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3254                 return;
3255         }
3256
3257         /*
3258          * Acquire the chain lock on the backing object.
3259          *
3260          * Give bobject an additional ref count for when it will be shadowed
3261          * by nobject.
3262          */
3263         useshadowlist = 0;
3264         if ((bobject = oobject->backing_object) != NULL) {
3265                 if (bobject->type != OBJT_VNODE) {
3266                         useshadowlist = 1;
3267                         vm_object_hold(bobject);
3268                         vm_object_chain_wait(bobject, 0);
3269                         /* ref for shadowing below */
3270                         vm_object_reference_locked(bobject);
3271                         vm_object_chain_acquire(bobject, 0);
3272                         KKASSERT(oobject->backing_object == bobject);
3273                         KKASSERT((bobject->flags & OBJ_DEAD) == 0);
3274                 } else {
3275                         /*
3276                          * vnodes are not placed on the shadow list but
3277                          * they still get another ref for the backing_object
3278                          * reference.
3279                          */
3280                         vm_object_reference_quick(bobject);
3281                 }
3282         }
3283
3284         /*
3285          * Calculate the object page range and allocate the new object.
3286          */
3287         offset = entry->offset;
3288         s = entry->start;
3289         e = entry->end;
3290
3291         offidxstart = OFF_TO_IDX(offset);
3292         offidxend = offidxstart + OFF_TO_IDX(e - s);
3293         size = offidxend - offidxstart;
3294
3295         switch(oobject->type) {
3296         case OBJT_DEFAULT:
3297                 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size),
3298                                               VM_PROT_ALL, 0);
3299                 break;
3300         case OBJT_SWAP:
3301                 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size),
3302                                            VM_PROT_ALL, 0);
3303                 break;
3304         default:
3305                 /* not reached */
3306                 nobject = NULL;
3307                 KKASSERT(0);
3308         }
3309
3310         /*
3311          * If we could not allocate nobject just clear ONEMAPPING on
3312          * oobject and return.
3313          */
3314         if (nobject == NULL) {
3315                 if (bobject) {
3316                         if (useshadowlist) {
3317                                 vm_object_chain_release(bobject);
3318                                 vm_object_deallocate(bobject);
3319                                 vm_object_drop(bobject);
3320                         } else {
3321                                 vm_object_deallocate(bobject);
3322                         }
3323                 }
3324                 vm_object_reference_locked_chain_held(oobject);
3325                 vm_object_clear_flag(oobject, OBJ_ONEMAPPING);
3326                 return;
3327         }
3328
3329         /*
3330          * The new object will replace entry->object.vm_object so it needs
3331          * a second reference (the caller expects an additional ref).
3332          */
3333         vm_object_hold(nobject);
3334         vm_object_reference_locked(nobject);
3335         vm_object_chain_acquire(nobject, 0);
3336
3337         /*
3338          * nobject shadows bobject (oobject already shadows bobject).
3339          *
3340          * Adding an object to bobject's shadow list requires refing bobject
3341          * which we did above in the useshadowlist case.
3342          *
3343          * XXX it is unclear if we need to clear ONEMAPPING on bobject here
3344          *     or not.
3345          */
3346         if (bobject) {
3347                 nobject->backing_object_offset =
3348                     oobject->backing_object_offset + IDX_TO_OFF(offidxstart);
3349                 nobject->backing_object = bobject;
3350                 if (useshadowlist) {
3351                         bobject->shadow_count++;
3352                         atomic_add_int(&bobject->generation, 1);
3353                         LIST_INSERT_HEAD(&bobject->shadow_head,
3354                                          nobject, shadow_list);
3355                         vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /*XXX*/
3356                         vm_object_set_flag(nobject, OBJ_ONSHADOW);
3357                 }
3358         }
3359
3360         /*
3361          * Move the VM pages from oobject to nobject
3362          */
3363         for (idx = 0; idx < size; idx++) {
3364                 vm_page_t m;
3365
3366                 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3367                                              TRUE, "vmpg");
3368                 if (m == NULL)
3369                         continue;
3370
3371                 /*
3372                  * We must wait for pending I/O to complete before we can
3373                  * rename the page.
3374                  *
3375                  * We do not have to VM_PROT_NONE the page as mappings should
3376                  * not be changed by this operation.
3377                  *
3378                  * NOTE: The act of renaming a page updates chaingen for both
3379                  *       objects.
3380                  */
3381                 vm_page_rename(m, nobject, idx);
3382                 /* page automatically made dirty by rename and cache handled */
3383                 /* page remains busy */
3384         }
3385
3386         if (oobject->type == OBJT_SWAP) {
3387                 vm_object_pip_add(oobject, 1);
3388                 /*
3389                  * copy oobject pages into nobject and destroy unneeded
3390                  * pages in shadow object.
3391                  */
3392                 swap_pager_copy(oobject, nobject, offidxstart, 0);
3393                 vm_object_pip_wakeup(oobject);
3394         }
3395
3396         /*
3397          * Wake up the pages we played with.  No spl protection is needed
3398          * for a simple wakeup.
3399          */
3400         for (idx = 0; idx < size; idx++) {
3401                 m = vm_page_lookup(nobject, idx);
3402                 if (m) {
3403                         KKASSERT(m->busy_count & PBUSY_LOCKED);
3404                         vm_page_wakeup(m);
3405                 }
3406         }
3407         entry->object.vm_object = nobject;
3408         entry->offset = 0LL;
3409
3410         /*
3411          * The map is being split and nobject is going to wind up on both
3412          * vm_map_entry's, so make sure OBJ_ONEMAPPING is cleared on
3413          * nobject.
3414          */
3415         vm_object_clear_flag(nobject, OBJ_ONEMAPPING);
3416
3417         /*
3418          * Cleanup
3419          *
3420          * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the
3421          *       related pages were moved and are no longer applicable to the
3422          *       original object.
3423          *
3424          * NOTE: Deallocate oobject (due to its entry->object.vm_object being
3425          *       replaced by nobject).
3426          */
3427         vm_object_chain_release(nobject);
3428         vm_object_drop(nobject);
3429         if (bobject && useshadowlist) {
3430                 vm_object_chain_release(bobject);
3431                 vm_object_drop(bobject);
3432         }
3433
3434 #if 0
3435         if (oobject->resident_page_count) {
3436                 kprintf("oobject %p still contains %jd pages!\n",
3437                         oobject, (intmax_t)oobject->resident_page_count);
3438                 for (idx = 0; idx < size; idx++) {
3439                         vm_page_t m;
3440
3441                         m = vm_page_lookup_busy_wait(oobject, offidxstart + idx,
3442                                                      TRUE, "vmpg");
3443                         if (m) {
3444                                 kprintf("oobject %p idx %jd\n",
3445                                         oobject,
3446                                         offidxstart + idx);
3447                                 vm_page_wakeup(m);
3448                         }
3449                 }
3450         }
3451 #endif
3452         /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/
3453         vm_object_deallocate_locked(oobject);
3454 }
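
/*
 * Worked example of the split arithmetic above, using illustrative
 * numbers (4KB pages): an entry covering [0x10000, 0x18000) with
 * entry->offset == 0x4000 gives
 *
 *	offidxstart = OFF_TO_IDX(0x4000)               = 4
 *	offidxend   = offidxstart + OFF_TO_IDX(0x8000) = 12
 *	size        = offidxend - offidxstart          = 8 pages
 *
 * so nobject is sized to exactly the 8 pages backing the entry, the
 * pages at oobject indices 4..11 are renamed to nobject indices 0..7,
 * and entry->offset is reset to 0.
 */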
3455
3456 /*
3457  * Copies the contents of the source entry to the destination
3458  * entry.  The entries *must* be aligned properly.
3459  *
3460  * The vm_maps must be exclusively locked.
3461  * The vm_map's token must be held.
3462  *
3463  * Because the maps are locked no faults can be in progress during the
3464  * operation.
3465  */
3466 static void
3467 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3468                   vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3469 {
3470         vm_object_t src_object;
3471         vm_object_t oobject;
3472
3473         if (dst_entry->maptype == VM_MAPTYPE_SUBMAP ||
3474             dst_entry->maptype == VM_MAPTYPE_UKSMAP)
3475                 return;
3476         if (src_entry->maptype == VM_MAPTYPE_SUBMAP ||
3477             src_entry->maptype == VM_MAPTYPE_UKSMAP)
3478                 return;
3479
3480         if (src_entry->wired_count == 0) {
3481                 /*
3482                  * If the source entry is marked needs_copy, it is already
3483                  * write-protected.
3484                  *
3485                  * To avoid interacting with a vm_fault that might have
3486                  * released its vm_map, we must acquire the fronting
3487                  * object.
3488                  */
3489                 oobject = src_entry->object.vm_object;
3490                 if (oobject) {
3491                         vm_object_hold(oobject);
3492                         vm_object_chain_acquire(oobject, 0);
3493                 }
3494
3495                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3496                         pmap_protect(src_map->pmap,
3497                             src_entry->start,
3498                             src_entry->end,
3499                             src_entry->protection & ~VM_PROT_WRITE);
3500                 }
3501
3502                 /*
3503                  * Make a copy of the object.
3504                  *
3505                  * The object must be locked prior to checking the object type
3506                  * and for the call to vm_object_collapse() and vm_map_split().
3507                  * We cannot use *_hold() here because the split code will
3508                  * probably try to destroy the object.  The lock is a pool
3509                  * token and doesn't care.
3510                  *
3511                  * We must bump src_map->timestamp when setting
3512                  * MAP_ENTRY_NEEDS_COPY to force any concurrent fault
3513                  * to retry, otherwise the concurrent fault might improperly
3514                  * install a RW pte when it's supposed to be a RO(COW) pte.
3515                  * This race can occur because a vnode-backed fault may have
3516                  * to temporarily release the map lock.  This was handled
3517                  * when the caller locked the map exclusively.
3518                  */
3519                 if (oobject) {
3520                         vm_map_split(src_entry, oobject);
3521
3522                         src_object = src_entry->object.vm_object;
3523                         dst_entry->object.vm_object = src_object;
3524                         src_entry->eflags |= (MAP_ENTRY_COW |
3525                                               MAP_ENTRY_NEEDS_COPY);
3526                         dst_entry->eflags |= (MAP_ENTRY_COW |
3527                                               MAP_ENTRY_NEEDS_COPY);
3528                         dst_entry->offset = src_entry->offset;
3529                 } else {
3530                         dst_entry->object.vm_object = NULL;
3531                         dst_entry->offset = 0;
3532                 }
3533                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3534                           dst_entry->end - dst_entry->start,
3535                           src_entry->start);
3536                 if (oobject) {
3537                         vm_object_chain_release(oobject);
3538                         vm_object_drop(oobject);
3539                 }
3540         } else {
3541                 /*
3542                  * Of course, wired down pages can't be set copy-on-write.
3543                  * Cause wired pages to be copied into the new map by
3544                  * simulating faults (the new pages are pageable)
3545                  */
3546                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3547         }
3548 }
3549
3550 /*
3551  * vmspace_fork:
3552  * Create a new process vmspace structure and vm_map
3553  * based on those of an existing process.  The new map
3554  * is based on the old map, according to the inheritance
3555  * values on the regions in that map.
3556  *
3557  * The source map must not be locked.
3558  * No requirements.
3559  */
3560 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3561                           vm_map_entry_t old_entry, int *countp);
3562 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3563                           vm_map_entry_t old_entry, int *countp);
3564
3565 struct vmspace *
3566 vmspace_fork(struct vmspace *vm1)
3567 {
3568         struct vmspace *vm2;
3569         vm_map_t old_map = &vm1->vm_map;
3570         vm_map_t new_map;
3571         vm_map_entry_t old_entry;
3572         int count;
3573
3574         lwkt_gettoken(&vm1->vm_map.token);
3575         vm_map_lock(old_map);
3576
3577         vm2 = vmspace_alloc(old_map->header.start, old_map->header.end);
3578         lwkt_gettoken(&vm2->vm_map.token);
3579
3580         /*
3581          * We must bump the timestamp to force any concurrent fault
3582          * to retry.
3583          */
3584         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3585               (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3586         new_map = &vm2->vm_map; /* XXX */
3587         new_map->timestamp = 1;
3588
3589         vm_map_lock(new_map);
3590
3591         count = 0;
3592         old_entry = old_map->header.next;
3593         while (old_entry != &old_map->header) {
3594                 ++count;
3595                 old_entry = old_entry->next;
3596         }
3597
3598         count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3599
3600         old_entry = old_map->header.next;
3601         while (old_entry != &old_map->header) {
3602                 switch(old_entry->maptype) {
3603                 case VM_MAPTYPE_SUBMAP:
3604                         panic("vm_map_fork: encountered a submap");
3605                         break;
3606                 case VM_MAPTYPE_UKSMAP:
3607                         vmspace_fork_uksmap_entry(old_map, new_map,
3608                                                   old_entry, &count);
3609                         break;
3610                 case VM_MAPTYPE_NORMAL:
3611                 case VM_MAPTYPE_VPAGETABLE:
3612                         vmspace_fork_normal_entry(old_map, new_map,
3613                                                   old_entry, &count);
3614                         break;
3615                 }
3616                 old_entry = old_entry->next;
3617         }
3618
3619         new_map->size = old_map->size;
3620         vm_map_unlock(old_map);
3621         vm_map_unlock(new_map);
3622         vm_map_entry_release(count);
3623
3624         lwkt_reltoken(&vm2->vm_map.token);
3625         lwkt_reltoken(&vm1->vm_map.token);
3626
3627         return (vm2);
3628 }
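
#if 0
/*
 * Illustrative sketch of the expected fork-path usage: the child
 * receives a copy-on-write image of the parent's vmspace.  The helper
 * and field accesses are assumptions for illustration; the real fork
 * code does considerably more bookkeeping.
 */
static void
example_fork_vmspace(struct proc *p1, struct proc *p2)
{
	p2->p_vmspace = vmspace_fork(p1->p_vmspace);
	pmap_pinit2(vmspace_pmap(p2->p_vmspace));
}
#endif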
3629
3630 static
3631 void
3632 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3633                           vm_map_entry_t old_entry, int *countp)
3634 {
3635         vm_map_entry_t new_entry;
3636         vm_object_t object;
3637
3638         switch (old_entry->inheritance) {
3639         case VM_INHERIT_NONE:
3640                 break;
3641         case VM_INHERIT_SHARE:
3642                 /*
3643                  * Clone the entry, creating the shared object if
3644                  * necessary.
3645                  */
3646                 if (old_entry->object.vm_object == NULL)
3647                         vm_map_entry_allocate_object(old_entry);
3648
3649                 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3650                         /*
3651                          * Shadow a map_entry which needs a copy,
3652                          * replacing its object with a new object
3653                          * that points to the old one.  Ask the
3654                          * shadow code to automatically add an
3655                          * additional ref.  We can't do it afterwards
3656                          * because we might race a collapse.  The call
3657                          * to vm_map_entry_shadow() will also clear
3658                          * OBJ_ONEMAPPING.
3659                          */
3660                         vm_map_entry_shadow(old_entry, 1);
3661                 } else if (old_entry->object.vm_object) {
3662                         /*
3663                          * We will make a shared copy of the object,
3664                          * and must clear OBJ_ONEMAPPING.
3665                          *
3666                          * Optimize vnode objects.  OBJ_ONEMAPPING
3667                          * is non-applicable but clear it anyway,
3668                          * and it's terminal, so we don't have to deal
3669                          * with chains.  Reduces SMP conflicts.
3670                          *
3671                          * XXX assert that object.vm_object != NULL
3672                          *     since we allocate it above.
3673                          */
3674                         object = old_entry->object.vm_object;
3675                         if (object->type == OBJT_VNODE) {
3676                                 vm_object_reference_quick(object);
3677                                 vm_object_clear_flag(object,
3678                                                      OBJ_ONEMAPPING);
3679                         } else {
3680                                 vm_object_hold(object);
3681                                 vm_object_chain_wait(object, 0);
3682                                 vm_object_reference_locked(object);
3683                                 vm_object_clear_flag(object,
3684                                                      OBJ_ONEMAPPING);
3685                                 vm_object_drop(object);
3686                         }
3687                 }
3688
3689                 /*
3690                  * Clone the entry.  We've already bumped the ref on
3691                  * any vm_object.
3692                  */
3693                 new_entry = vm_map_entry_create(new_map, countp);
3694                 *new_entry = *old_entry;
3695                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3696                 new_entry->wired_count = 0;
3697
3698                 /*
3699                  * Insert the entry into the new map -- we know we're
3700                  * inserting at the end of the new map.
3701                  */
3702                 vm_map_entry_link(new_map, new_map->header.prev,
3703                                   new_entry);
3704
3705                 /*
3706                  * Update the physical map
3707                  */
3708                 pmap_copy(new_map->pmap, old_map->pmap,
3709                           new_entry->start,
3710                           (old_entry->end - old_entry->start),
3711                           old_entry->start);
3712                 break;
3713         case VM_INHERIT_COPY:
3714                 /*
3715                  * Clone the entry and link into the map.
3716                  */
3717                 new_entry = vm_map_entry_create(new_map, countp);
3718                 *new_entry = *old_entry;
3719                 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3720                 new_entry->wired_count = 0;
3721                 new_entry->object.vm_object = NULL;
3722                 vm_map_entry_link(new_map, new_map->header.prev,
3723                                   new_entry);
3724                 vm_map_copy_entry(old_map, new_map, old_entry,
3725                                   new_entry);
3726                 break;
3727         }
3728 }
3729
3730 /*
3731  * When forking user-kernel shared maps, the map might change in the
3732  * child so do not try to copy the underlying pmap entries.
3733  */
3734 static
3735 void
3736 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3737                           vm_map_entry_t old_entry, int *countp)
3738 {
3739         vm_map_entry_t new_entry;
3740
3741         new_entry = vm_map_entry_create(new_map, countp);
3742         *new_entry = *old_entry;
3743         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3744         new_entry->wired_count = 0;
3745         vm_map_entry_link(new_map, new_map->header.prev,
3746                           new_entry);
3747 }
3748
3749 /*
3750  * Create an auto-grow stack entry
3751  *
3752  * No requirements.
3753  */
3754 int
3755 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3756               int flags, vm_prot_t prot, vm_prot_t max, int cow)
3757 {
3758         vm_map_entry_t  prev_entry;
3759         vm_map_entry_t  new_stack_entry;
3760         vm_size_t       init_ssize;
3761         int             rv;
3762         int             count;
3763         vm_offset_t     tmpaddr;
3764
3765         cow |= MAP_IS_STACK;
3766
3767         if (max_ssize < sgrowsiz)
3768                 init_ssize = max_ssize;
3769         else
3770                 init_ssize = sgrowsiz;
3771
3772         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3773         vm_map_lock(map);
3774
3775         /*
3776          * Find space for the mapping
3777          */
3778         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3779                 if (vm_map_findspace(map, addrbos, max_ssize, 1,
3780                                      flags, &tmpaddr)) {
3781                         vm_map_unlock(map);
3782                         vm_map_entry_release(count);
3783                         return (KERN_NO_SPACE);
3784                 }
3785                 addrbos = tmpaddr;
3786         }
3787
3788         /* If addr is already mapped, no go */
3789         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3790                 vm_map_unlock(map);
3791                 vm_map_entry_release(count);
3792                 return (KERN_NO_SPACE);
3793         }
3794
3795 #if 0
3796         /* XXX already handled by kern_mmap() */
3797         /* If we would blow our VMEM resource limit, no go */
3798         if (map->size + init_ssize >
3799             curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3800                 vm_map_unlock(map);
3801                 vm_map_entry_release(count);
3802                 return (KERN_NO_SPACE);
3803         }
3804 #endif
3805
3806         /*
3807          * If we can't accommodate max_ssize in the current mapping,
3808          * no go.  However, we need to be aware that subsequent user
3809          * mappings might map into the space we have reserved for the
3810          * stack, and currently this space is not protected.
3811          * 
3812          * Hopefully we will at least detect this condition 
3813          * when we try to grow the stack.
3814          */
3815         if ((prev_entry->next != &map->header) &&
3816             (prev_entry->next->start < addrbos + max_ssize)) {
3817                 vm_map_unlock(map);
3818                 vm_map_entry_release(count);
3819                 return (KERN_NO_SPACE);
3820         }
3821
3822         /*
3823          * We initially map a stack of only init_ssize.  We will
3824          * grow as needed later.  Since this is to be a grow 
3825          * down stack, we map at the top of the range.
3826          *
3827          * Note: we would normally expect prot and max to be
3828          * VM_PROT_ALL, and cow to be 0.  Possibly we should
3829          * eliminate these as input parameters, and just
3830          * pass these values here in the insert call.
3831          */
3832         rv = vm_map_insert(map, &count, NULL, NULL,
3833                            0, addrbos + max_ssize - init_ssize,
3834                            addrbos + max_ssize,
3835                            VM_MAPTYPE_NORMAL,
3836                            VM_SUBSYS_STACK, prot, max, cow);
3837
3838         /* Now set the avail_ssize amount */
3839         if (rv == KERN_SUCCESS) {
3840                 if (prev_entry != &map->header)
3841                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count);
3842                 new_stack_entry = prev_entry->next;
3843                 if (new_stack_entry->end   != addrbos + max_ssize ||
3844                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
3845                         panic ("Bad entry start/end for new stack entry");
3846                 else 
3847                         new_stack_entry->aux.avail_ssize = max_ssize - init_ssize;
3848         }
3849
3850         vm_map_unlock(map);
3851         vm_map_entry_release(count);
3852         return (rv);
3853 }
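
#if 0
/*
 * Illustrative sketch: creating a grow-down user stack at a fixed
 * bottom-of-stack address.  The helper name is hypothetical; exec and
 * MAP_STACK-style mmap()s are the expected real callers.
 */
static int
example_create_stack(vm_map_t map, vm_offset_t stack_bottom,
		     vm_size_t max_ssize)
{
	return (vm_map_stack(map, stack_bottom, max_ssize, MAP_FIXED,
			     VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif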
3854
3855 /*
3856  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3857  * desired address is already mapped, or if we successfully grow
3858  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3859  * stack range (this is strange, but preserves compatibility with
3860  * the grow function in vm_machdep.c).
3861  *
3862  * No requirements.
3863  */
3864 int
3865 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3866 {
3867         vm_map_entry_t prev_entry;
3868         vm_map_entry_t stack_entry;
3869         vm_map_entry_t new_stack_entry;
3870         struct vmspace *vm;
3871         struct lwp *lp;
3872         struct proc *p;
3873         vm_offset_t    end;
3874         int grow_amount;
3875         int rv = KERN_SUCCESS;
3876         int is_procstack;
3877         int use_read_lock = 1;
3878         int count;
3879
3880         /*
3881          * Find the vm
3882          */
3883         lp = curthread->td_lwp;
3884         p = curthread->td_proc;
3885         KKASSERT(lp != NULL);
3886         vm = lp->lwp_vmspace;
3887
3888         /*
3889          * Growstack is only allowed on the current process.  We disallow
3890          * other use cases, e.g. trying to access memory via procfs that
3891          * the stack hasn't grown into.
3892          */
3893         if (map != &vm->vm_map) {
3894                 return KERN_FAILURE;
3895         }
3896
3897         count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3898 Retry:
3899         if (use_read_lock)
3900                 vm_map_lock_read(map);
3901         else
3902                 vm_map_lock(map);
3903
3904         /* If addr is already in the entry range, no need to grow. */
3905         if (vm_map_lookup_entry(map, addr, &prev_entry))
3906                 goto done;
3907
3908         if ((stack_entry = prev_entry->next) == &map->header)
3909                 goto done;
3910         if (prev_entry == &map->header) 
3911                 end = stack_entry->start - stack_entry->aux.avail_ssize;
3912         else
3913                 end = prev_entry->end;
3914
3915         /*
3916          * This next test mimics the old grow function in vm_machdep.c.
3917          * It really doesn't quite make sense, but we do it anyway
3918          * for compatibility.
3919          *
3920          * If the stack is not growable, return success.  This signals
3921          * the caller to proceed as it normally would with normal vm.
3922          */
3923         if (stack_entry->aux.avail_ssize < 1 ||
3924             addr >= stack_entry->start ||
3925             addr <  stack_entry->start - stack_entry->aux.avail_ssize) {
3926                 goto done;
3927         } 
3928         
3929         /* Find the minimum grow amount */
3930         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
3931         if (grow_amount > stack_entry->aux.avail_ssize) {
3932                 rv = KERN_NO_SPACE;
3933                 goto done;
3934         }
3935
3936         /*
3937          * If there is no longer enough space between the entries,
3938          * no go; adjust the available space.  Note: this
3939          * should only happen if the user has mapped into the
3940          * stack area after the stack was created, and is
3941          * probably an error.
3942          *
3943          * This also effectively destroys any guard page the user
3944          * might have intended by limiting the stack size.
3945          */
3946         if (grow_amount > stack_entry->start - end) {
3947                 if (use_read_lock && vm_map_lock_upgrade(map)) {
3948                         /* lost lock */
3949                         use_read_lock = 0;
3950                         goto Retry;
3951                 }
3952                 use_read_lock = 0;
3953                 stack_entry->aux.avail_ssize = stack_entry->start - end;
3954                 rv = KERN_NO_SPACE;
3955                 goto done;
3956         }
3957
3958         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
3959
3960         /* If this is the main process stack, see if we're over the 
3961          * stack limit.
3962          */
3963         if (is_procstack && (vm->vm_ssize + grow_amount >
3964                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3965                 rv = KERN_NO_SPACE;
3966                 goto done;
3967         }
3968
3969         /* Round up the grow amount to a multiple of sgrowsiz */
3970         grow_amount = roundup (grow_amount, sgrowsiz);
3971         if (grow_amount > stack_entry->aux.avail_ssize) {
3972                 grow_amount = stack_entry->aux.avail_ssize;
3973         }
3974         if (is_procstack && (vm->vm_ssize + grow_amount >
3975                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
3976                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
3977         }
3978
3979         /* If we would blow our VMEM resource limit, no go */
3980         if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3981                 rv = KERN_NO_SPACE;
3982                 goto done;
3983         }
3984
3985         if (use_read_lock && vm_map_lock_upgrade(map)) {
3986                 /* lost lock */
3987                 use_read_lock = 0;
3988                 goto Retry;
3989         }
3990         use_read_lock = 0;
3991
3992         /* Get the preliminary new entry start value */
3993         addr = stack_entry->start - grow_amount;
3994
3995         /* If this puts us into the previous entry, cut back our growth
3996          * to the available space.  Also, see the note above.
3997          */
3998         if (addr < end) {
3999                 stack_entry->aux.avail_ssize = stack_entry->start - end;
4000                 addr = end;
4001         }
4002
4003         rv = vm_map_insert(map, &count, NULL, NULL,
4004                            0, addr, stack_entry->start,
4005                            VM_MAPTYPE_NORMAL,
4006                            VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4007
4008         /* Adjust the available stack space by the amount we grew. */
4009         if (rv == KERN_SUCCESS) {
4010                 if (prev_entry != &map->header)
4011                         vm_map_clip_end(map, prev_entry, addr, &count);
4012                 new_stack_entry = prev_entry->next;
4013                 if (new_stack_entry->end   != stack_entry->start  ||
4014                     new_stack_entry->start != addr)
4015                         panic ("Bad stack grow start/end in new stack entry");
4016                 else {
4017                         new_stack_entry->aux.avail_ssize =
4018                                 stack_entry->aux.avail_ssize -
4019                                 (new_stack_entry->end - new_stack_entry->start);
4020                         if (is_procstack) {
4021                                 vm->vm_ssize += new_stack_entry->end -
4022                                                 new_stack_entry->start;
4023                         }
4024                 }
4025
4026                 if (map->flags & MAP_WIREFUTURE)
4027                         vm_map_unwire(map, new_stack_entry->start,
4028                                       new_stack_entry->end, FALSE);
4029         }
4030
4031 done:
4032         if (use_read_lock)
4033                 vm_map_unlock_read(map);
4034         else
4035                 vm_map_unlock(map);
4036         vm_map_entry_release(count);
4037         return (rv);
4038 }
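
#if 0
/*
 * Illustrative sketch: the page-fault path is expected to give the
 * stack a chance to grow before rejecting a faulting address.  The
 * helper name is hypothetical.
 */
static int
example_try_growstack(vm_map_t map, vm_offset_t fault_addr)
{
	if (vm_map_growstack(map, fault_addr) != KERN_SUCCESS)
		return (KERN_FAILURE);
	/* ... continue with normal fault handling on the map ... */
	return (KERN_SUCCESS);
}
#endif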
4039
4040 /*
4041  * Unshare the specified VM space for exec.  If other processes are
4042  * mapped to it, then create a new one.  The new vmspace starts out empty.
4043  *
4044  * No requirements.
4045  */
4046 void
4047 vmspace_exec(struct proc *p, struct vmspace *vmcopy) 
4048 {
4049         struct vmspace *oldvmspace = p->p_vmspace;
4050         struct vmspace *newvmspace;
4051         vm_map_t map = &p->p_vmspace->vm_map;
4052
4053         /*
4054          * If we are execing a resident vmspace we fork it, otherwise
4055          * we create a new vmspace.  Note that exitingcnt is not
4056          * copied to the new vmspace.
4057          */
4058         lwkt_gettoken(&oldvmspace->vm_map.token);
4059         if (vmcopy)  {
4060                 newvmspace = vmspace_fork(vmcopy);
4061                 lwkt_gettoken(&newvmspace->vm_map.token);
4062         } else {
4063                 newvmspace = vmspace_alloc(map->header.start, map->header.end);
4064                 lwkt_gettoken(&newvmspace->vm_map.token);
4065                 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4066                       (caddr_t)&oldvmspace->vm_endcopy -
4067                        (caddr_t)&oldvmspace->vm_startcopy);
4068         }
4069
4070         /*
4071          * Finish initializing the vmspace before assigning it
4072          * to the process.  The vmspace will become the current vmspace
4073          * if p == curproc.
4074          */
4075         pmap_pinit2(vmspace_pmap(newvmspace));
4076         pmap_replacevm(p, newvmspace, 0);
4077         lwkt_reltoken(&newvmspace->vm_map.token);
4078         lwkt_reltoken(&oldvmspace->vm_map.token);
4079         vmspace_rel(oldvmspace);
4080 }
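
#if 0
/*
 * Illustrative sketch: the exec path either builds a fresh, empty
 * vmspace (vmcopy == NULL) or forks a resident image.  The helper
 * name is hypothetical.
 */
static void
example_exec_replace_vmspace(struct proc *p, struct vmspace *resident)
{
	/* resident may be NULL; an empty vmspace is created in that case */
	vmspace_exec(p, resident);
}
#endif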
4081
4082 /*
4083  * Unshare the specified VM space for forcing COW.  This
4084  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4085  */
4086 void
4087 vmspace_unshare(struct proc *p) 
4088 {
4089         struct vmspace *oldvmspace = p->p_vmspace;
4090         struct vmspace *newvmspace;
4091
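             /*
              * Fast path: if nothing else references the vmspace there is
              * nothing to unshare.
              */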
4092         lwkt_gettoken(&oldvmspace->vm_map.token);
4093         if (vmspace_getrefs(oldvmspace) == 1) {
4094                 lwkt_reltoken(&oldvmspace->vm_map.token);
4095                 return;
4096         }
4097         newvmspace = vmspace_fork(oldvmspace);
4098         lwkt_gettoken(&newvmspace->vm_map.token);
4099         pmap_pinit2(vmspace_pmap(newvmspace));
4100         pmap_replacevm(p, newvmspace, 0);
4101         lwkt_reltoken(&newvmspace->vm_map.token);
4102         lwkt_reltoken(&oldvmspace->vm_map.token);
4103         vmspace_rel(oldvmspace);
4104 }
4105
4106 /*
4107  * vm_map_hint: return the beginning of the best area suitable for
4108  * creating a new mapping with "prot" protection.
4109  *
4110  * No requirements.
4111  */
4112 vm_offset_t
4113 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4114 {
4115         struct vmspace *vms = p->p_vmspace;
4116         struct rlimit limit;
4117         rlim_t dsiz;
4118
4119         /*
4120          * Acquire the datasize limit for the mmap() operation; fall back
4121          * to maxdsiz if the resource limit cannot be obtained.
4122          */
4123         if (kern_getrlimit(RLIMIT_DATA, &limit))
4124                 limit.rlim_cur = maxdsiz;
4125         dsiz = limit.rlim_cur;
4126
4127         if (!randomize_mmap || addr != 0) {
4128                 /*
4129                  * Set a reasonable start point for the hint if it was
4130                  * not specified or if it falls within the heap space.
4131                  * Hinted mmap()s do not allocate out of the heap space.
4132                  */
4133                 if (addr == 0 ||
4134                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4135                      addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4136                         addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4137                 }
4138
4139                 return addr;
4140         }
4141
4142         /*
4143          * randomize_mmap && addr == 0.  For now randomize the
4144          * address within a dsiz range beyond the data limit.
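              * The hint ends up roughly uniformly distributed over
              * [vm_daddr + dsiz, vm_daddr + 2 * dsiz).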
4145          */
4146         addr = (vm_offset_t)vms->vm_daddr + dsiz;
4147         if (dsiz)
4148                 addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4149         return (round_page(addr));
4150 }
4151
4152 /*
4153  * Finds the VM object, offset, and protection for a given virtual address
4154  * in the specified map, assuming a page fault of the type specified.
4155  *
4156  * Leaves the map in question locked for read; return values are guaranteed
4157  * until a vm_map_lookup_done call is performed.  Note that the map argument
4158  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4159  *
4160  * A handle (out_entry) is returned for use in vm_map_lookup_done to make
4161  * that call fast.
4162  *
4163  * If a lookup is requested with "write protection" specified, the map may
4164  * be changed to perform virtual copying operations, although the data
4165  * referenced will remain the same.
4166  *
4167  * No requirements.
4168  */
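     /*
      * Illustrative caller pattern (a sketch only; the real consumer is the
      * vm_fault path, which does considerably more work around the lookup):
      *
      *      rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry,
      *                         &object, &pindex, &prot, &wflags);
      *      if (rv == KERN_SUCCESS) {
      *              ... look up or allocate the page in object at pindex ...
      *              vm_map_lookup_done(map, entry, 0);
      *      }
      *
      * On success the map comes back read-locked until vm_map_lookup_done()
      * is called (a count of 0 means there is no vm_map_entry reserve to
      * release).  On failure the map comes back unlocked and no
      * vm_map_lookup_done() call is needed.
      */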
4169 int
4170 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4171               vm_offset_t vaddr,
4172               vm_prot_t fault_typea,
4173               vm_map_entry_t *out_entry,        /* OUT */
4174               vm_object_t *object,              /* OUT */
4175               vm_pindex_t *pindex,              /* OUT */
4176               vm_prot_t *out_prot,              /* OUT */
4177               int *wflags)                      /* OUT */
4178 {
4179         vm_map_entry_t entry;
4180         vm_map_t map = *var_map;
4181         vm_prot_t prot;
4182         vm_prot_t fault_type = fault_typea;
4183         int use_read_lock = 1;
4184         int rv = KERN_SUCCESS;
4185         int count;
4186         thread_t td = curthread;
4187
4188         /*
4189          * vm_map_entry_reserve() implements an important mitigation
4190          * against mmap() abuse running the kernel out of vm_map_entry
4191          * structures, but it can also cause an infinite call recursion.
4192          * Use td_nest_count to prevent an infinite recursion (allows
4193          * the vm_map code to dig into the pcpu vm_map_entry reserve).
4194          */
4195         count = 0;
4196         if (td->td_nest_count == 0) {
4197                 ++td->td_nest_count;
4198                 count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4199                 --td->td_nest_count;
4200         }
4201 RetryLookup:
4202         if (use_read_lock)
4203                 vm_map_lock_read(map);
4204         else
4205                 vm_map_lock(map);
4206
4207         /*
4208          * Always do a full lookup.  The hint doesn't get us much anymore
4209          * now that the map is RB'd.
4210          */
4211         cpu_ccfence();
4212         *out_entry = &map->header;
4213         *object = NULL;
4214
4215         {
4216                 vm_map_entry_t tmp_entry;
4217
4218                 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4219                         rv = KERN_INVALID_ADDRESS;
4220                         goto done;
4221                 }
4222                 entry = tmp_entry;
4223                 *out_entry = entry;
4224         }
4225
4226         /*
4227          * Handle submaps.
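              * The lookup is restarted in the submap, which is handed back
              * to the caller through *var_map.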
4228          */
4229         if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4230                 vm_map_t old_map = map;
4231
4232                 *var_map = map = entry->object.sub_map;
4233                 if (use_read_lock)
4234                         vm_map_unlock_read(old_map);
4235                 else
4236                         vm_map_unlock(old_map);
4237                 use_read_lock = 1;
4238                 goto RetryLookup;
4239         }
4240
4241         /*
4242          * Check whether this task is allowed to have this page.
4243          * Note the special case for MAP_ENTRY_COW pages with an override.
4244          * This is to implement a forced COW for debuggers.
4245          */
4246         if (fault_type & VM_PROT_OVERRIDE_WRITE)
4247                 prot = entry->max_protection;
4248         else
4249                 prot = entry->protection;
4250
4251         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4252         if ((fault_type & prot) != fault_type) {
4253                 rv = KERN_PROTECTION_FAILURE;
4254                 goto done;
4255         }
4256
4257         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4258             (entry->eflags & MAP_ENTRY_COW) &&
4259             (fault_type & VM_PROT_WRITE) &&
4260             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4261                 rv = KERN_PROTECTION_FAILURE;
4262                 goto done;
4263         }
4264
4265         /*
4266          * If this page is not pageable, we have to get it for all possible
4267          * accesses.
4268          */
4269         *wflags = 0;
4270         if (entry->wired_count) {
4271                 *wflags |= FW_WIRED;
4272                 prot = fault_type = entry->protection;
4273         }
4274
4275         /*
4276          * Virtual page tables may need to update the accessed (A) bit
4277          * in a page table entry.  Upgrade the fault to a write fault for
4278          * that case if the map will support it.  If the map does not support
4279          * it the page table entry simply will not be updated.
4280          */
4281         if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
4282                 if (prot & VM_PROT_WRITE)
4283                         fault_type |= VM_PROT_WRITE;
4284         }
4285
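             /*
              * Pmaps which emulate the page table accessed/dirty bits (see
              * pmap_emulate_ad_bits()) may also need the fault promoted to
              * a write fault, similarly to the VPAGETABLE case above.
              */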
4286         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4287             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4288                 if ((prot & VM_PROT_WRITE) == 0)
4289                         fault_type |= VM_PROT_WRITE;
4290         }
4291
4292         /*
4293          * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
4294          */
4295         if (entry->maptype != VM_MAPTYPE_NORMAL &&
4296             entry->maptype != VM_MAPTYPE_VPAGETABLE) {
4297                 *object = NULL;
4298                 goto skip;
4299         }
4300
4301         /*
4302          * If the entry was copy-on-write, we either shadow the entry now
              * (write fault) or demote the returned protection (read fault).
4303          */
4304         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4305                 /*
4306                  * If we want to write the page, we may as well handle that
4307                  * now since we've got the map locked.
4308                  *
4309                  * If we don't need to write the page, we just demote the
4310                  * permissions allowed.
4311                  */
4312                 if (fault_type & VM_PROT_WRITE) {
4313                         /*
4314                          * Not allowed if TDF_NOFAULT is set as the shadowing
4315                          * operation can deadlock against the faulting
4316                          * function due to the copy-on-write.
4317                          */
4318                         if (curthread->td_flags & TDF_NOFAULT) {
4319                                 rv = KERN_FAILURE_NOFAULT;
4320                                 goto done;
4321                         }
4322
4323                         /*
4324                          * Make a new object, and place it in the object
4325                          * chain.  Note that no new references have appeared
4326                          * -- one just moved from the map to the new
4327                          * object.
4328                          */
4329                         if (use_read_lock && vm_map_lock_upgrade(map)) {
4330                                 /* lost lock */
4331                                 use_read_lock = 0;
4332                                 goto RetryLookup;
4333                         }
4334                         use_read_lock = 0;
4335                         vm_map_entry_shadow(entry, 0);
4336                         *wflags |= FW_DIDCOW;
4337                 } else {
4338                         /*
4339                          * We're attempting to read a copy-on-write page --
4340                          * don't allow writes.
4341                          */
4342                         prot &= ~VM_PROT_WRITE;
4343                 }
4344         }
4345
4346         /*
4347          * Create an object if necessary.  This code also handles
4348          * partitioning large entries to improve vm_fault performance.
4349          */
4350         if (entry->object.vm_object == NULL && !map->system_map) {
4351                 if (use_read_lock && vm_map_lock_upgrade(map))  {
4352                         /* lost lock */
4353                         use_read_lock = 0;
4354                         goto RetryLookup;
4355                 }
4356                 use_read_lock = 0;
4357
4358                 /*
4359                  * Partition large entries, giving each its own VM object,
4360                  * to improve concurrent fault performance.  This is only
4361                  * applicable to userspace.
4362                  */
4363                 if (map != &kernel_map &&
4364                     entry->maptype == VM_MAPTYPE_NORMAL &&
4365                     ((entry->start ^ entry->end) & ~MAP_ENTRY_PARTITION_MASK) &&
4366                     vm_map_partition_enable) {
4367                         if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4368                                 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4369                                 ++mycpu->gd_cnt.v_intrans_coll;
4370                                 ++mycpu->gd_cnt.v_intrans_wait;
4371                                 vm_map_transition_wait(map, 0);
4372                                 goto RetryLookup;
4373                         }
4374                         vm_map_entry_partition(map, entry, vaddr, &count);
4375                 }
4376                 vm_map_entry_allocate_object(entry);
4377         }
4378
4379         /*
4380          * Return the object/offset from this entry.  If the entry was
4381          * copy-on-write or empty, it has been fixed up.
4382          */
4383         *object = entry->object.vm_object;
4384
4385 skip:
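             /*
              * Compute the page index within the backing store that
              * corresponds to vaddr.
              */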
4386         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4387
4388         /*
4389          * Return the protection actually granted for the fault.  On
4390          * success we return with a read lock held on the map.  On failure
4391          * we return with the map unlocked.
4392          */
4393         *out_prot = prot;
4394 done:
4395         if (rv == KERN_SUCCESS) {
4396                 if (use_read_lock == 0)
4397                         vm_map_lock_downgrade(map);
4398         } else if (use_read_lock) {
4399                 vm_map_unlock_read(map);
4400         } else {
4401                 vm_map_unlock(map);
4402         }
4403         if (count > 0)
4404                 vm_map_entry_release(count);
4405
4406         return (rv);
4407 }
4408
4409 /*
4410  * Releases locks acquired by a vm_map_lookup()
4411  * (according to the handle returned by that lookup).
4412  *
4413  * No other requirements.
4414  */
4415 void
4416 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4417 {
4418         /*
4419          * Unlock the main-level map
4420          */
4421         vm_map_unlock_read(map);
4422         if (count)
4423                 vm_map_entry_release(count);
4424 }
4425
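     /*
      * Clip the given entry so that it covers (at most) the single
      * MAP_ENTRY_PARTITION_SIZE-aligned chunk containing vaddr.  Giving each
      * chunk its own entry, and eventually its own VM object, reduces
      * collisions on concurrent faults.
      */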
4426 static void
4427 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4428                        vm_offset_t vaddr, int *countp)
4429 {
4430         vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4431         vm_map_clip_start(map, entry, vaddr, countp);
4432         vaddr += MAP_ENTRY_PARTITION_SIZE;
4433         vm_map_clip_end(map, entry, vaddr, countp);
4434 }
4435
4436 /*
4437  * Quick hack, needs some help to make it more SMP friendly.
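      *
      * vm_map_interlock() blocks until no already-installed interlock
      * overlaps the requested address range, then installs its own ilock on
      * the map; vm_map_deinterlock() removes the ilock and wakes up any
      * waiters.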
4438  */
4439 void
4440 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4441                  vm_offset_t ran_beg, vm_offset_t ran_end)
4442 {
4443         struct vm_map_ilock *scan;
4444
4445         ilock->ran_beg = ran_beg;
4446         ilock->ran_end = ran_end;
4447         ilock->flags = 0;
4448
4449         spin_lock(&map->ilock_spin);
4450 restart:
4451         for (scan = map->ilock_base; scan; scan = scan->next) {
4452                 if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4453                         scan->flags |= ILOCK_WAITING;
4454                         ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4455                         goto restart;
4456                 }
4457         }
4458         ilock->next = map->ilock_base;
4459         map->ilock_base = ilock;
4460         spin_unlock(&map->ilock_spin);
4461 }
4462
4463 void
4464 vm_map_deinterlock(vm_map_t map, struct vm_map_ilock *ilock)
4465 {
4466         struct vm_map_ilock *scan;
4467         struct vm_map_ilock **scanp;
4468
4469         spin_lock(&map->ilock_spin);
4470         scanp = &map->ilock_base;
4471         while ((scan = *scanp) != NULL) {
4472                 if (scan == ilock) {
4473                         *scanp = ilock->next;
4474                         spin_unlock(&map->ilock_spin);
4475                         if (ilock->flags & ILOCK_WAITING)
4476                                 wakeup(ilock);
4477                         return;
4478                 }
4479                 scanp = &scan->next;
4480         }
4481         spin_unlock(&map->ilock_spin);
4482         panic("vm_map_deinterlock: missing ilock!");
4483 }
4484
4485 #include "opt_ddb.h"
4486 #ifdef DDB
4487 #include <ddb/ddb.h>
4488
4489 /*
4490  * Debugging only
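      *
      * Invoked from the ddb prompt as "show map <vm_map address>".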
4491  */
4492 DB_SHOW_COMMAND(map, vm_map_print)
4493 {
4494         static int nlines;
4495         /* XXX convert args. */
4496         vm_map_t map = (vm_map_t)addr;
4497         boolean_t full = have_addr;
4498
4499         vm_map_entry_t entry;
4500
4501         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4502             (void *)map,
4503             (void *)map->pmap, map->nentries, map->timestamp);
4504         nlines++;
4505
4506         if (!full && db_indent)
4507                 return;
4508
4509         db_indent += 2;
4510         for (entry = map->header.next; entry != &map->header;
4511             entry = entry->next) {
4512                 db_iprintf("map entry %p: start=%p, end=%p\n",
4513                     (void *)entry, (void *)entry->start, (void *)entry->end);
4514                 nlines++;
4515                 {
4516                         static const char *inheritance_name[4] =
4517                         {"share", "copy", "none", "donate_copy"};
4518
4519                         db_iprintf(" prot=%x/%x/%s",
4520                             entry->protection,
4521                             entry->max_protection,
4522                             inheritance_name[(int)(unsigned char)
4523                                                 entry->inheritance]);
4524                         if (entry->wired_count != 0)
4525                                 db_printf(", wired");
4526                 }
4527                 switch (entry->maptype) {
4528                 case VM_MAPTYPE_SUBMAP:
4529                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4530                         db_printf(", share=%p, offset=0x%lx\n",
4531                             (void *)entry->object.sub_map,
4532                             (long)entry->offset);
4533                         nlines++;
4534                         if ((entry->prev == &map->header) ||
4535                             (entry->prev->object.sub_map !=
4536                                 entry->object.sub_map)) {
4537                                 db_indent += 2;
4538                                 vm_map_print((db_expr_t)(intptr_t)
4539                                              entry->object.sub_map,
4540                                              full, 0, NULL);
4541                                 db_indent -= 2;
4542                         }
4543                         break;
4544                 case VM_MAPTYPE_NORMAL:
4545                 case VM_MAPTYPE_VPAGETABLE:
4546                         /* XXX no %qd in kernel.  Truncate entry->offset. */
4547                         db_printf(", object=%p, offset=0x%lx",
4548                             (void *)entry->object.vm_object,
4549                             (long)entry->offset);
4550                         if (entry->eflags & MAP_ENTRY_COW)
4551                                 db_printf(", copy (%s)",
4552                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4553                         db_printf("\n");
4554                         nlines++;
4555
4556                         if ((entry->prev == &map->header) ||
4557                             (entry->prev->object.vm_object !=
4558                                 entry->object.vm_object)) {
4559                                 db_indent += 2;
4560                                 vm_object_print((db_expr_t)(intptr_t)
4561                                                 entry->object.vm_object,
4562                                                 full, 0, NULL);
4563                                 nlines += 4;
4564                                 db_indent -= 2;
4565                         }
4566                         break;
4567                 case VM_MAPTYPE_UKSMAP:
4568                         db_printf(", uksmap=%p, offset=0x%lx",
4569                             (void *)entry->object.uksmap,
4570                             (long)entry->offset);
4571                         if (entry->eflags & MAP_ENTRY_COW)
4572                                 db_printf(", copy (%s)",
4573                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4574                         db_printf("\n");
4575                         nlines++;
4576                         break;
4577                 default:
4578                         break;
4579                 }
4580         }
4581         db_indent -= 2;
4582         if (db_indent == 0)
4583                 nlines = 0;
4584 }
4585
4586 /*
4587  * Debugging only
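      *
      * Invoked from the ddb prompt as "show procvm [<proc address>]";
      * defaults to curproc when no address is given.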
4588  */
4589 DB_SHOW_COMMAND(procvm, procvm)
4590 {
4591         struct proc *p;
4592
4593         if (have_addr) {
4594                 p = (struct proc *) addr;
4595         } else {
4596                 p = curproc;
4597         }
4598
4599         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4600             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4601             (void *)vmspace_pmap(p->p_vmspace));
4602
4603         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4604 }
4605
4606 #endif /* DDB */