kernel - Implement CPU localization hinting for low level page allocations
[dragonfly.git] / sys / vm / vm_zeroidle.c
/*
 * Copyright (c) 1994 John Dyson
 * Copyright (c) 2001,2016 Matt Dillon
 * Copyright (c) 2010,2016 The DragonFly Project
 *
 * All Rights Reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Venkatesh Srinivas <me@endeavour.zapto.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/thread.h>
#include <sys/kthread.h>
#include <sys/unistd.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <cpu/lwbuf.h>

#include <sys/thread2.h>
#include <vm/vm_page2.h>

#if 0
/*
 * Remove this file in 2017.
 *
 * REMOVED - Basically does not provide any performance benefit and instead
 *	     appears to cause a performance detriment.  I surmise the issue
 *	     is simply that it takes such an enormous amount of time to read
 *	     data from dynamic ram that what really matters for a page fault
 *	     is not that the page is zeroed but that its cache is hot.
 *
 *	     Zeroing the page during idle periods means the page is likely
 *	     to be cold in the cache when it actually gets used.  Zeroing the
 *	     page in-line with the VM fault, on the other hand, not only
 *	     ensures that the memory will be hot in the cache; the zeroing
 *	     operation itself does not actually have to read the dynamic ram,
 *	     it really only writes into the cache (for a 4K page), so the
 *	     page is already hot when the user program then accesses it.
 */

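/*
 * A minimal sketch of the in-line alternative described above, assuming
 * the usual VM helpers (vm_page_alloc(), pmap_zero_page(),
 * VM_PAGE_TO_PHYS()); the real fault path differs in detail:
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 *	if ((m->flags & PG_ZERO) == 0)
 *		pmap_zero_page(VM_PAGE_TO_PHYS(m));
 *
 * A 4K zeroing pass fits in the L1 cache, so the fault pays mostly
 * cache-write costs and hands the user a cache-hot page.
 */
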
/*
 * Implement the pre-zeroed page mechanism.
 */
/* Number of bytes to zero between reschedule checks */
#define IDLEZERO_RUN	(64)

/* Maximum number of pages per second to zero */
#define NPAGES_RUN	(20000)
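
/*
 * At 4K pages, NPAGES_RUN caps idle zeroing at roughly 20000 * 4096
 * bytes = ~78 MiB of memory zeroed per second; vm_pagezero() consumes
 * this in batches of idlezero_rate / 10 pages per wakeup.
 */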

static int idlezero_enable = 1;
TUNABLE_INT("vm.idlezero_enable", &idlezero_enable);
SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0,
	   "Allow the kernel to use idle CPU cycles to zero pages");
static int idlezero_rate = NPAGES_RUN;
SYSCTL_INT(_vm, OID_AUTO, idlezero_rate, CTLFLAG_RW, &idlezero_rate, 0,
	   "Maximum pages per second to zero");
static int idlezero_nocache = -1;
SYSCTL_INT(_vm, OID_AUTO, idlezero_nocache, CTLFLAG_RW, &idlezero_nocache, 0,
	   "Use non-caching stores to zero pages (-1 auto-detects)");

static ulong idlezero_count = 0;
SYSCTL_ULONG(_vm, OID_AUTO, idlezero_count, CTLFLAG_RD, &idlezero_count, 0,
	     "The number of physical pages prezeroed at idle time");

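/*
 * Typical usage of the knobs above from a root shell, e.g.:
 *
 *	sysctl vm.idlezero_enable=0	(disable idle-time zeroing)
 *	sysctl vm.idlezero_rate=10000	(halve the pages/sec cap)
 *	sysctl vm.idlezero_count	(pages zeroed so far, read-only)
 *
 * vm.idlezero_enable may also be set at boot from /boot/loader.conf
 * via the TUNABLE_INT() hook above.
 */
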
enum zeroidle_state {
	STATE_IDLE,
	STATE_GET_PAGE,
	STATE_ZERO_PAGE,
	STATE_RELEASE_PAGE
};
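
/*
 * vm_pagezero() cycles STATE_IDLE -> STATE_GET_PAGE -> STATE_ZERO_PAGE ->
 * STATE_RELEASE_PAGE -> STATE_GET_PAGE, dropping back to STATE_IDLE when
 * its per-wakeup page budget is exhausted or no free page is available.
 */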

#define DEFAULT_SLEEP_TIME	(hz / 10)
#define LONG_SLEEP_TIME		(hz * 10)

/*
 * Attempt to maintain approximately 1/2 of our free pages in a
 * PG_ZERO'd state.  Add some hysteresis to (attempt to) avoid
 * generally zeroing a page when the system is near steady-state.
 * Otherwise we might get 'flutter' during disk I/O / IPC or
 * fast sleeps.  We also do not want to be continuously zeroing
 * pages because doing so may flush our L1 and L2 caches too much.
 *
 * Returns non-zero if pages should be zeroed.
 */
static int
vm_page_zero_check(int *zero_countp, int *zero_statep)
{
	int base;
	int count;
	int nz;
	int nt;
	int i;

	*zero_countp = 0;
	if (idlezero_enable == 0)
		return (0);

	/*
	 * Sample only the slice of the PQ_FREE queues whose page colors
	 * map to this cpu: start at this cpu's page color, cover at
	 * least 16 queues and roughly PQ_L2_SIZE / ncpus of them, and
	 * clamp to the end of the queue array.
	 */
	base = vm_get_pg_color(mycpu->gd_cpuid, NULL, 0) & PQ_L2_MASK;
	count = 16;
	while (count < PQ_L2_SIZE / ncpus)
		count <<= 1;
	if (base + count > PQ_L2_SIZE)
		count = PQ_L2_SIZE - base;

	/*
	 * Tally zeroed vs total free pages in our slice and convert to
	 * a percentage; treat a statistically tiny sample as fully
	 * zeroed so we don't bother.
	 */
	for (i = nt = nz = 0; i < count; ++i) {
		struct vpgqueues *vpq = &vm_page_queues[PQ_FREE + base + i];
		nz += vpq->zero_count;
		nt += vpq->lcnt;
	}

	if (nt > 10) {
		*zero_countp = nz * 100 / nt;
	} else {
		*zero_countp = 100;
	}
	if (*zero_statep == 0) {
		/*
		 * Wait for the count to fall to the LO threshold (50%)
		 * before starting to zero pages.
		 */
		if (*zero_countp <= 50)
			*zero_statep = 1;
	} else {
		/*
		 * Once we are zeroing pages, wait for the count to rise
		 * to the HI threshold (90%) before we stop.
		 */
		if (*zero_countp >= 90)
			*zero_statep = 0;
	}
	return (*zero_statep);
}
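
/*
 * Hysteresis example: with the 50%/90% thresholds above, a system
 * hovering with ~70% of its sampled free pages pre-zeroed neither
 * starts nor stops the zeroer; zeroing begins only once the ratio
 * decays to 50% and continues until it recovers to 90%, so brief
 * bursts of page consumption do not toggle the thread on and off.
 */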

/*
 * vm_pagezero should sleep for a longer time when idlezero is disabled or
 * when there is an excess of zeroed pages.
 */
static int
vm_page_zero_time(int zero_count)
{
	if (idlezero_enable == 0)
		return (LONG_SLEEP_TIME);
	if (zero_count >= 90)
		return (LONG_SLEEP_TIME);
	return (DEFAULT_SLEEP_TIME);
}

/*
 * MPSAFE thread
 */
static void
vm_pagezero(void *arg)
{
	vm_page_t m = NULL;
	struct lwbuf *lwb = NULL;
	struct lwbuf lwb_cache;
	enum zeroidle_state state = STATE_IDLE;
	char *pg = NULL;
	int npages = 0;
	int sleep_time;
	int i = 0;
	int cpu = (int)(intptr_t)arg;
	int zero_state = 0;

	/*
	 * Adjust thread parameters before entering our loop.  The thread
	 * is started with the MP lock held and with normal kernel thread
	 * priority.
	 *
	 * Also pin us to the cpu we were created for, so each thread
	 * works its own cpu-localized slice of the free queues.
	 *
	 * For now leave the MP lock held; the VM routines cannot be called
	 * with it released until tokenization is finished.
	 */
	lwkt_setpri_self(TDPRI_IDLE_WORK);
	lwkt_setcpu_self(globaldata_find(cpu));
	sleep_time = DEFAULT_SLEEP_TIME;

	/*
	 * Loop forever
	 */
	for (;;) {
		int zero_count;

		switch(state) {
		case STATE_IDLE:
			/*
			 * Wait for work.
			 */
			tsleep(&zero_state, 0, "pgzero", sleep_time);
			if (vm_page_zero_check(&zero_count, &zero_state))
				npages = idlezero_rate / 10;
			sleep_time = vm_page_zero_time(zero_count);
			if (npages)
				state = STATE_GET_PAGE;	/* Fallthrough */
			break;
		case STATE_GET_PAGE:
			/*
			 * Acquire a page to zero
			 */
			if (--npages == 0) {
				state = STATE_IDLE;
			} else {
				m = vm_page_free_fromq_fast();
				if (m == NULL) {
					state = STATE_IDLE;
				} else {
					state = STATE_ZERO_PAGE;
					lwb = lwbuf_alloc(m, &lwb_cache);
					pg = (char *)lwbuf_kva(lwb);
					i = 0;
				}
			}
			break;
		case STATE_ZERO_PAGE:
			/*
			 * Zero the page IDLEZERO_RUN bytes at a time,
			 * yielding between chunks so we never hog the cpu.
			 */
			while (i < PAGE_SIZE) {
				if (idlezero_nocache == 1)
					bzeront(&pg[i], IDLEZERO_RUN);
				else
					bzero(&pg[i], IDLEZERO_RUN);
				i += IDLEZERO_RUN;
				lwkt_yield();
			}
			state = STATE_RELEASE_PAGE;
			break;
		case STATE_RELEASE_PAGE:
			lwbuf_free(lwb);
			vm_page_flag_set(m, PG_ZERO);
			vm_page_free_toq(m);
			state = STATE_GET_PAGE;
			++idlezero_count;	/* non-locked, SMP race ok */
			break;
		}
		lwkt_yield();
	}
}
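
/*
 * Note: bzeront() is the machine-dependent cache-bypassing bzero; on
 * cpus advertising CPU_MI_BZERONT it is expected to use non-temporal
 * stores so page zeroing does not evict useful L1/L2 cache lines,
 * which is what the idlezero_nocache knob selects.
 */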

static void
pagezero_start(void __unused *arg)
{
	struct thread *td;
	int i;

	/*
	 * Default to cache-bypassing zeroing when the cpu supports it.
	 */
	if (idlezero_nocache < 0 && (cpu_mi_feature & CPU_MI_BZERONT))
		idlezero_nocache = 1;

	/*
	 * Start one zeroing thread per cpu.
	 */
	for (i = 0; i < ncpus; ++i) {
		kthread_create(vm_pagezero, (void *)(intptr_t)i,
			       &td, "pagezero %d", i);
	}
}

SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL);
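
/*
 * SI_SUB_KTHREAD_VM orders this with the other VM kernel-thread
 * startups; at boot one "pagezero %d" thread is created per cpu,
 * each pinning itself via lwkt_setcpu_self() in vm_pagezero().
 */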

#endif