sys/vm/vm_swapcache.c (dragonfly.git)
/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled, swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */
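
/*
 * Informal tuning summary of the sysctls defined below (see the
 * swapcache(8) manual page on DragonFly for the full description):
 *
 *      vm.swapcache.data_enable   enable caching of clean file data
 *      vm.swapcache.meta_enable   enable caching of filesystem meta-data
 *      vm.swapcache.accrate       write bandwidth accumulation, bytes/sec
 *      vm.swapcache.maxburst      cap on the accumulated write burst
 *
 * With vm.swapcache.use_chflags left at its default of 1, file data is
 * only cached for vnodes flagged VSWAPCACHE (typically marked from
 * userland via the 'cache' file flag, chflags(1)).
 */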

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/* the kernel process "swapcached" */
static int vm_swapcached_flush (vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static int vm_swapcache_writing_heuristic(void);
static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
static void vm_swapcache_cleaning(vm_object_t marker, int *swindexp);
static void vm_swapcache_movemarker(vm_object_t marker, int swindex,
                                vm_object_t object);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_min_hysteresis;
int vm_swapcache_use_chflags = 1;       /* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
        CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
        CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
        CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
        CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
        CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
        CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
        CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
        CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
        CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
        CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
        CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
        CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
        CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
        CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
        CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
        CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)    \
        ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
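
/*
 * Informal note: with the default vm_swapcache_maxswappct of 75,
 * SWAPMAX(0) evaluates to 75% of vm_swap_max and SWAPMAX(-10) to 65%.
 * The state machine in vm_swapcached_thread() switches from writing to
 * cleaning when swapcache use exceeds the former and back to writing
 * once it drops below the latter.
 */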

/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
        vm_swapcache_read_enable = 0;
        vm_swapcache_data_enable = 0;
        vm_swapcache_meta_enable = 0;
        wakeup(&vm_swapcache_sleep);    /* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level pageout daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
        static struct vm_page page_marker[PQ_L2_SIZE];
        static struct vm_object swmarker;
        static int swindex;
        int q;

        /*
         * Thread setup
         */
        curthread->td_flags |= TDF_SYSTHREAD;
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
                              swapcached_thread, SHUTDOWN_PRI_FIRST);
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
                              NULL, SHUTDOWN_PRI_SECOND);

        /*
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
        bzero(&page_marker, sizeof(page_marker));
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
                page_marker[q].queue = PQ_INACTIVE + q;
                page_marker[q].pc = q;
                page_marker[q].wire_count = 1;
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_INSERT_HEAD(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        vm_swapcache_min_hysteresis = 1024;
        vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
        vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
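
        /*
         * Note: starting the inactive heuristic at -hysteresis means that
         * roughly vm_swapcache_hysteresis pages must accumulate in the
         * inactive queue (the VM system bumps the counter as pages are
         * deactivated) before vm_swapcache_writing_heuristic() allows a
         * writing pass.
         */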

        /*
         * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
         */
        bzero(&swmarker, sizeof(swmarker));
        swmarker.type = OBJT_MARKER;
        swindex = 0;
        lwkt_gettoken(&vmobj_tokens[swindex]);
        TAILQ_INSERT_HEAD(&vm_object_lists[swindex],
                          &swmarker, object_list);
        lwkt_reltoken(&vmobj_tokens[swindex]);

        for (;;) {
                int reached_end;
                int scount;
                int count;

                /*
                 * Handle shutdown
                 */
                kproc_suspend_loop();

                /*
                 * Check every 5 seconds when not enabled or if no swap
                 * is present.
                 */
                if ((vm_swapcache_data_enable == 0 &&
                     vm_swapcache_meta_enable == 0) ||
                    vm_swap_max == 0) {
                        tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
                        continue;
                }

                /*
                 * Polling rate when enabled is approximately 10 hz.
                 */
                tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

                /*
                 * State hysteresis.  Generate write activity up to 75% of
                 * swap (SWAPMAX(0)), then clean out swap assignments down
                 * to 65% (SWAPMAX(-10)), then repeat.
                 */
                if (state == SWAPC_WRITING) {
                        if (vm_swap_cache_use > SWAPMAX(0))
                                state = SWAPC_CLEANING;
                } else {
                        if (vm_swap_cache_use < SWAPMAX(-10))
                                state = SWAPC_WRITING;
                }

                /*
                 * We are allowed to continue accumulating burst value
                 * in either state.  Allow the user to set curburst > maxburst
                 * for the initial load-in.
                 */
                if (vm_swapcache_curburst < vm_swapcache_maxburst) {
                        vm_swapcache_curburst += vm_swapcache_accrate / 10;
                        if (vm_swapcache_curburst > vm_swapcache_maxburst)
                                vm_swapcache_curburst = vm_swapcache_maxburst;
                }
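
                /*
                 * In other words, at the default accrate of 100000
                 * bytes/sec the ~10hz poll above accumulates roughly 10KB
                 * of burst per pass, capped at maxburst.
                 */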

                /*
                 * We don't want to nickel-and-dime the scan as that will
                 * create unnecessary fragmentation.  The minimum burst
                 * is one second's worth of accumulation.
                 */
                if (state != SWAPC_WRITING) {
                        vm_swapcache_cleaning(&swmarker, &swindex);
                        continue;
                }
                if (vm_swapcache_curburst < vm_swapcache_accrate)
                        continue;

                reached_end = 0;
                count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
                scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;
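
                /*
                 * The launder and scan budgets are spread evenly across
                 * the PQ_L2_SIZE inactive sub-queues; the +2 keeps each
                 * per-queue budget non-zero even for very small settings.
                 */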

                if (burst == SWAPB_BURSTING) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                                vm_swapcache_writing(
                                                        &page_marker[q],
                                                        count,
                                                        scount);
                                }
                        }
                        if (vm_swapcache_curburst <= 0)
                                burst = SWAPB_RECOVERING;
                } else if (vm_swapcache_curburst > vm_swapcache_minburst) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                                vm_swapcache_writing(
                                                        &page_marker[q],
                                                        count,
                                                        scount);
                                }
                        }
                        burst = SWAPB_BURSTING;
                }
                if (reached_end == PQ_L2_SIZE) {
                        vm_swapcache_inactive_heuristic =
                                -vm_swapcache_hysteresis;
                }
        }

        /*
         * Cleanup (NOT REACHED)
         */
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_REMOVE(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        lwkt_gettoken(&vmobj_tokens[swindex]);
        TAILQ_REMOVE(&vm_object_lists[swindex], &swmarker, object_list);
        lwkt_reltoken(&vmobj_tokens[swindex]);
}

static struct kproc_desc swpc_kp = {
        "swapcached",
        vm_swapcached_thread,
        &swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)

/*
 * Deal with an overflow of the heuristic counter, or with the user
 * manually changing the hysteresis.
 *
 * Try to avoid small incremental pageouts by waiting for enough
 * pages to build up in the inactive queue to hopefully get a good
 * burst in.  This heuristic is bumped by the VM system and reset
 * when our scan hits the end of the queue.
 *
 * Return TRUE if we need to take a writing pass.
 */
static int
vm_swapcache_writing_heuristic(void)
{
        int hyst;

        hyst = vmstats.v_inactive_count / 4;
        if (hyst < vm_swapcache_min_hysteresis)
                hyst = vm_swapcache_min_hysteresis;
        cpu_ccfence();
        vm_swapcache_hysteresis = hyst;

        if (vm_swapcache_inactive_heuristic < -hyst)
                vm_swapcache_inactive_heuristic = -hyst;

        return (vm_swapcache_inactive_heuristic >= 0);
}

/*
 * Take a writing pass on one of the inactive queues, return non-zero if
 * we hit the end of the queue.
 */
static int
vm_swapcache_writing(vm_page_t marker, int count, int scount)
{
        vm_object_t object;
        struct vnode *vp;
        vm_page_t m;
        int isblkdev;

        /*
         * Scan the inactive queue from our marker to locate
         * suitable pages to push to the swap cache.
         *
         * We are looking for clean vnode-backed pages.
         */
        vm_page_queues_spin_lock(marker->queue);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
               count > 0 && scount-- > 0) {
                KKASSERT(m->queue == marker->queue);

                if (vm_swapcache_curburst < 0)
                        break;
                TAILQ_REMOVE(
                        &vm_page_queues[marker->queue].pl, marker, pageq);
                TAILQ_INSERT_AFTER(
                        &vm_page_queues[marker->queue].pl, m, marker, pageq);

                /*
                 * Ignore markers and ignore pages that already have a swap
                 * assignment.
                 */
                if (m->flags & (PG_MARKER | PG_SWAPPED))
                        continue;
                if (vm_page_busy_try(m, TRUE))
                        continue;
                vm_page_queues_spin_unlock(marker->queue);

                if ((object = m->object) == NULL) {
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                vm_object_hold(object);
                if (m->object != object) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                if (vm_swapcache_test(m)) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                vp = object->handle;
                if (vp == NULL) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                switch(vp->v_type) {
                case VREG:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }

                        /*
                         * If data_enable is 0 do not try to swapcache data.
                         * If use_chflags is set then only swapcache data for
                         * VSWAPCACHE marked vnodes, otherwise any vnode.
                         */
                        if (vm_swapcache_data_enable == 0 ||
                            ((vp->v_flag & VSWAPCACHE) == 0 &&
                             vm_swapcache_use_chflags)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_maxfilesize &&
                            object->size >
                            (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 0;
                        break;
                case VCHR:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_meta_enable == 0) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 1;
                        break;
                default:
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                /*
                 * Assign swap and initiate I/O.
                 *
                 * (vm_swapcached_flush() returns the number of pages it
                 *  pushed, minimum 1, which is debited from our launder
                 *  budget)
                 */
                count -= vm_swapcached_flush(m, isblkdev);

                /*
                 * Setup for next loop using marker.
                 */
                vm_object_drop(object);
                vm_page_queues_spin_lock(marker->queue);
        }

        /*
         * The marker could wind up at the end, which is ok.  If we hit the
         * end of the list adjust the heuristic.
         *
         * Earlier inactive pages that were dirty and become clean
         * are typically moved to the end of PQ_INACTIVE by virtue
         * of vfs_vmio_release() when they become unwired from the
         * buffer cache.
         */
        vm_page_queues_spin_unlock(marker->queue);

        /*
         * m invalid but can be used to test for NULL
         */
        return (m == NULL);
}

/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups, the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
        vm_object_t object;
        vm_page_t marray[SWAP_META_PAGES];
        vm_pindex_t basei;
        int rtvals[SWAP_META_PAGES];
        int x;
        int i;
        int j;
        int count;
        int error;

        vm_page_io_start(m);
        vm_page_protect(m, VM_PROT_READ);
        object = m->object;
        vm_object_hold(object);

        /*
         * Try to cluster around (m), keeping in mind that the swap pager
         * can only do SWAP_META_PAGES worth of contiguous write.
         */
        x = (int)m->pindex & SWAP_META_MASK;
        marray[x] = m;
        basei = m->pindex;
        vm_page_wakeup(m);
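
        /*
         * Index note: x is m's offset within its naturally aligned
         * SWAP_META_PAGES block, so (basei - x) is the pindex of the first
         * page of that block and marray[] slots correspond one-to-one with
         * block positions.  The loops below extend the cluster backwards
         * from x - 1 and forwards from x + 1.
         */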

        for (i = x - 1; i >= 0; --i) {
                m = vm_page_lookup_busy_try(object, basei - x + i,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[i] = m;
                vm_page_wakeup(m);
        }
        ++i;

        for (j = x + 1; j < SWAP_META_PAGES; ++j) {
                m = vm_page_lookup_busy_try(object, basei - x + j,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[j] = m;
                vm_page_wakeup(m);
        }

        count = j - i;
        vm_object_pip_add(object, count);
        swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
        vm_swapcache_write_count += count * PAGE_SIZE;
        vm_swapcache_curburst -= count * PAGE_SIZE;
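
        /*
         * Accounting note: each page handed to swap_pager_putpages() debits
         * PAGE_SIZE from curburst and credits write_count; curburst running
         * dry is what pushes the daemon into SWAPB_RECOVERING in the caller.
         */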

        while (i < j) {
                if (rtvals[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(marray[i], FALSE, "swppgfd");
                        vm_page_io_finish(marray[i]);
                        vm_page_wakeup(marray[i]);
                        vm_object_pip_wakeup(object);
                }
                ++i;
        }
        vm_object_drop(object);
        return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
        vm_object_t object;

        if (m->flags & PG_UNMANAGED)
                return(1);
        if (m->hold_count || m->wire_count)
                return(1);
        if (m->valid != VM_PAGE_BITS_ALL)
                return(1);
        if (m->dirty & m->valid)
                return(1);
        if ((object = m->object) == NULL)
                return(1);
        if (object->type != OBJT_VNODE ||
            (object->flags & OBJ_DEAD)) {
                return(1);
        }
        vm_page_test_dirty(m);
        if (m->dirty & m->valid)
                return(1);
        return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects up to vm_swapcache_cleanperobj (default 16MB)
 * per pass.
 */
static
void
vm_swapcache_cleaning(vm_object_t marker, int *swindexp)
{
        vm_object_t object;
        struct vnode *vp;
        int count;
        int scount;
        int n;

        count = vm_swapcache_maxlaunder;
        scount = vm_swapcache_maxscan;

        /*
         * Look for vnode objects
         */
        lwkt_gettoken(&vmobj_tokens[*swindexp]);

outerloop:
        while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
                /*
                 * We have to skip markers.  We cannot hold/drop marker
                 * objects!
                 */
                if (object->type == OBJT_MARKER) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        continue;
                }

                /*
                 * Safety, or in case there are millions of VM objects
                 * without swapcache backing.
                 */
                if (--scount <= 0)
                        goto breakout;

                /*
                 * We must hold the object before potentially yielding.
                 */
                vm_object_hold(object);
                lwkt_yield();

                /*
                 * Only operate on live VNODE objects that are either
                 * VREG or VCHR (VCHR for meta-data).
                 */
                if ((object->type != OBJT_VNODE) ||
                    ((object->flags & OBJ_DEAD) ||
                     object->swblock_count == 0) ||
                    ((vp = object->handle) == NULL) ||
                    (vp->v_type != VREG && vp->v_type != VCHR)) {
                        vm_object_drop(object);
                        /* object may be invalid now */
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        continue;
                }

                /*
                 * Reset the object pindex stored in the marker if the
                 * working object has changed.
                 */
                if (marker->backing_object != object) {
                        marker->size = 0;
                        marker->backing_object_offset = 0;
                        marker->backing_object = object;
                }

                /*
                 * Look for swblocks starting at our iterator.
                 *
                 * The swap_pager_condfree() function attempts to free
                 * swap space starting at the specified index.  The index
                 * will be updated on return.  The function will return
                 * a scan factor (NOT the number of blocks freed).
                 *
                 * If it must cut its scan of the object short due to an
                 * excessive number of swblocks, or is able to free the
                 * requested number of blocks, it will return n >= count
                 * and we break and pick it back up on a future attempt.
                 *
                 * Scan the object linearly and try to batch large sets of
                 * blocks that are likely to clean out entire swap radix
                 * tree leafs.
                 */
                lwkt_token_swap();
                lwkt_reltoken(&vmobj_tokens[*swindexp]);

                n = swap_pager_condfree(object, &marker->size,
                                    (count + SWAP_META_MASK) & ~SWAP_META_MASK);

                vm_object_drop(object);         /* object may be invalid now */
                lwkt_gettoken(&vmobj_tokens[*swindexp]);

                /*
                 * If we have exhausted the object, or have freed up to our
                 * per-object limit (vm_swapcache_cleanperobj), move on to
                 * the next object.  Note that the current object may no
                 * longer be on the vm_object_list.
                 */
                if (n <= 0 ||
                    marker->backing_object_offset > vm_swapcache_cleanperobj) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                }

                /*
                 * If we have exhausted our max-launder, stop for now.
                 */
                count -= n;
                marker->backing_object_offset += n * PAGE_SIZE;
                if (count < 0)
                        goto breakout;
        }

        /*
         * Iterate vm_object_lists[] hash table
         */
        TAILQ_REMOVE(&vm_object_lists[*swindexp], marker, object_list);
        lwkt_reltoken(&vmobj_tokens[*swindexp]);
        if (++*swindexp >= VMOBJ_HSIZE)
                *swindexp = 0;
        lwkt_gettoken(&vmobj_tokens[*swindexp]);
        TAILQ_INSERT_HEAD(&vm_object_lists[*swindexp], marker, object_list);

        if (*swindexp != 0)
                goto outerloop;

breakout:
        lwkt_reltoken(&vmobj_tokens[*swindexp]);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(vm_object_t marker, int swindex, vm_object_t object)
{
        if (TAILQ_NEXT(marker, object_list) == object) {
                TAILQ_REMOVE(&vm_object_lists[swindex], marker, object_list);
                TAILQ_INSERT_AFTER(&vm_object_lists[swindex], object,
                                   marker, object_list);
        }
}