polling(4)'s poll/pollmore netmsg handler will not be changed, so initialize
[dragonfly.git] / sys / kern / kern_poll.c
CommitLineData
984263bc
MD
1/*-
2 * Copyright (c) 2001-2002 Luigi Rizzo
3 *
4 * Supported by: the Xorp Project (www.xorp.org)
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: src/sys/kern/kern_poll.c,v 1.2.2.4 2002/06/27 23:26:33 luigi Exp $
553d42f9 28 * $DragonFly: src/sys/kern/kern_poll.c,v 1.47 2008/09/23 14:14:20 sephe Exp $
984263bc
MD
29 */
30
2b71c8f1
SZ
31#include "opt_polling.h"
32
984263bc 33#include <sys/param.h>
984263bc 34#include <sys/kernel.h>
d16c94f7 35#include <sys/ktr.h>
984263bc
MD
36#include <sys/socket.h> /* needed by net/if.h */
37#include <sys/sysctl.h>
38
df4596e1
JH
39#include <sys/thread2.h>
40#include <sys/msgport2.h>
41
984263bc 42#include <net/if.h> /* for IFF_* flags */
1630efc5 43#include <net/netmsg2.h>
984263bc 44
984263bc
MD
45/*
46 * Polling support for [network] device drivers.
47 *
48 * Drivers which support this feature try to register with the
49 * polling code.
50 *
51 * If registration is successful, the driver must disable interrupts,
52 * and further I/O is performed through the handler, which is invoked
53 * (at least once per clock tick) with 3 arguments: the "arg" passed at
54 * register time (a struct ifnet pointer), a command, and a "count" limit.
55 *
56 * The command can be one of the following:
57 * POLL_ONLY: quick move of "count" packets from input/output queues.
58 * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
59 * other more expensive operations. This command is issued periodically
60 * but less frequently than POLL_ONLY.
61 * POLL_DEREGISTER: deregister and return to interrupt mode.
9c095379 62 * POLL_REGISTER: register and disable interrupts
984263bc
MD
63 *
64 * The first two commands are only issued if the interface is marked as
1736eecd 65 * 'IFF_UP, IFF_RUNNING and IFF_POLLING', the last two only if IFF_RUNNING
55b970a5 66 * is set.
984263bc
MD
67 *
68 * The count limit specifies how much work the handler can do during the
69 * call -- typically this is the number of packets to be received, or
70 * transmitted, etc. (drivers are free to interpret this number, as long
71 * as the max time spent in the function grows roughly linearly with the
72 * count).
73 *
74 * Deregistration can be requested by the driver itself (typically in the
75 * *_stop() routine), or by the polling code, by invoking the handler.
76 *
a7a12d73
SZ
77 * Polling can be enabled or disabled on particular CPU_X with the sysctl
78 * variable kern.polling.X.enable (default is 1, enabled)
984263bc
MD
79 *
80 * A second variable controls the sharing of CPU between polling/kernel
81 * network processing, and other activities (typically userlevel tasks):
a7a12d73 82 * kern.polling.X.user_frac (between 0 and 100, default 50) sets the share
984263bc
MD
83 * of CPU allocated to user tasks. CPU is allocated proportionally to the
84 * shares, by dynamically adjusting the "count" (poll_burst).
85 *
 * Other parameters should be left at their default values.
87 * The following constraints hold
88 *
a76f9acb
SZ
89 * 1 <= poll_burst <= poll_burst_max
90 * 1 <= poll_each_burst <= poll_burst_max
984263bc
MD
91 * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
92 */
93
/* Allowed range for the poll_burst_max tunable. */
#define MIN_POLL_BURST_MAX	10
#define MAX_POLL_BURST_MAX	1000

/* Default upper limit of the burst; good for 100Mbit net and HZ=1000. */
#define POLL_BURST_MAX		150
/* Default size of each chunk a burst is split into. */
#define POLL_EACH_BURST		5

/*
 * Polling frequency limits.  The maximum may be overridden at build time;
 * the default is what each per-cpu context starts with.
 */
#ifndef DEVICE_POLLING_FREQ_MAX
#define DEVICE_POLLING_FREQ_MAX		30000
#endif
#define DEVICE_POLLING_FREQ_DEFAULT	2000
103
94ebffcd
SZ
/* Capacity of the per-context handler table (pollctx.pr[]). */
#define POLL_LIST_LEN	128

/* One registered polling client: the interface whose if_poll is called. */
struct pollrec {
	struct ifnet	*ifp;
};
984263bc 108
94ebffcd
SZ
109#define POLLCTX_MAX 32
110
111struct pollctx {
112 struct sysctl_ctx_list poll_sysctl_ctx;
113 struct sysctl_oid *poll_sysctl_tree;
114
0eb141f0
SZ
115 uint32_t poll_burst; /* state */
116 uint32_t poll_each_burst; /* tunable */
117 uint32_t poll_burst_max; /* tunable */
118 uint32_t user_frac; /* tunable */
119 int reg_frac_count; /* state */
120 uint32_t reg_frac; /* tunable */
121 uint32_t short_ticks; /* statistics */
122 uint32_t lost_polls; /* statistics */
123 uint32_t pending_polls; /* state */
124 int residual_burst; /* state */
125 uint32_t phase; /* state */
126 uint32_t suspect; /* statistics */
127 uint32_t stalled; /* statistics */
128 struct timeval poll_start_t; /* state */
129 struct timeval prev_t; /* state */
94ebffcd
SZ
130
131 uint32_t poll_handlers; /* next free entry in pr[]. */
132 struct pollrec pr[POLL_LIST_LEN];
133
134 int poll_cpuid;
135 struct systimer pollclock;
0eb141f0
SZ
136 int polling_enabled; /* tunable */
137 int pollhz; /* tunable */
1630efc5
SZ
138
139 struct netmsg poll_netmsg;
140 struct netmsg poll_more_netmsg;
94ebffcd 141};
984263bc 142
94ebffcd 143static struct pollctx *poll_context[POLLCTX_MAX];
984263bc 144
94ebffcd
SZ
145SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
146 "Device polling parameters");
984263bc 147
94ebffcd
SZ
148static int poll_defcpu = -1;
149SYSCTL_INT(_kern_polling, OID_AUTO, defcpu, CTLFLAG_RD,
1630efc5 150 &poll_defcpu, 0, "default CPU to run device polling");
984263bc 151
1630efc5 152static uint32_t poll_cpumask0 = 0xffffffff;
a7a12d73 153TUNABLE_INT("kern.polling.cpumask", (int *)&poll_cpumask0);
984263bc 154
1630efc5
SZ
155static uint32_t poll_cpumask;
156SYSCTL_INT(_kern_polling, OID_AUTO, cpumask, CTLFLAG_RD,
157 &poll_cpumask, 0, "CPUs that can run device polling");
158
159static int polling_enabled = 1; /* global polling enable */
320d152e 160TUNABLE_INT("kern.polling.enable", &polling_enabled);
984263bc 161
94ebffcd
SZ
162static int pollhz = DEVICE_POLLING_FREQ_DEFAULT;
163TUNABLE_INT("kern.polling.pollhz", &pollhz);
984263bc 164
37bca80f
SZ
165static int poll_burst_max = POLL_BURST_MAX;
166TUNABLE_INT("kern.polling.burst_max", &poll_burst_max);
167
ae8add5b
SZ
168static int poll_each_burst = POLL_EACH_BURST;
169TUNABLE_INT("kern.polling.each_burst", &poll_each_burst);
170
a7a12d73 171/* Netisr handlers */
94ebffcd
SZ
172static void netisr_poll(struct netmsg *);
173static void netisr_pollmore(struct netmsg *);
1630efc5
SZ
174static void poll_register(struct netmsg *);
175static void poll_deregister(struct netmsg *);
176static void poll_sysctl_pollhz(struct netmsg *);
177static void poll_sysctl_polling(struct netmsg *);
8a43da2b 178static void poll_sysctl_regfrac(struct netmsg *);
b372084d 179static void poll_sysctl_burstmax(struct netmsg *);
e79f3356 180static void poll_sysctl_eachburst(struct netmsg *);
1630efc5 181
94ebffcd
SZ
182/* Systimer handler */
183static void pollclock(systimer_t, struct intrframe *);
984263bc 184
94ebffcd
SZ
185/* Sysctl handlers */
186static int sysctl_pollhz(SYSCTL_HANDLER_ARGS);
187static int sysctl_polling(SYSCTL_HANDLER_ARGS);
8a43da2b 188static int sysctl_regfrac(SYSCTL_HANDLER_ARGS);
b372084d 189static int sysctl_burstmax(SYSCTL_HANDLER_ARGS);
e79f3356 190static int sysctl_eachburst(SYSCTL_HANDLER_ARGS);
94ebffcd
SZ
191static void poll_add_sysctl(struct sysctl_ctx_list *,
192 struct sysctl_oid_list *, struct pollctx *);
b7572f04 193
553d42f9 194static void schedpoll_oncpu(struct netmsg *);
1630efc5 195
94ebffcd 196void init_device_poll_pcpu(int); /* per-cpu init routine */
984263bc 197
d16c94f7
SZ
198#define POLL_KTR_STRING "ifp=%p"
199#define POLL_KTR_ARG_SIZE (sizeof(void *))
200
201#ifndef KTR_POLLING
202#define KTR_POLLING KTR_ALL
203#endif
204KTR_INFO_MASTER(poll);
205KTR_INFO(KTR_POLLING, poll, beg, 0, POLL_KTR_STRING, POLL_KTR_ARG_SIZE);
206KTR_INFO(KTR_POLLING, poll, end, 1, POLL_KTR_STRING, POLL_KTR_ARG_SIZE);
207
208#define logpoll(name, arg) KTR_LOG(poll_ ## name, arg)
209
0eb141f0
SZ
210static __inline void
211poll_reset_state(struct pollctx *pctx)
212{
6f96e50e 213 crit_enter();
0eb141f0
SZ
214 pctx->poll_burst = 5;
215 pctx->reg_frac_count = 0;
216 pctx->pending_polls = 0;
217 pctx->residual_burst = 0;
218 pctx->phase = 0;
219 bzero(&pctx->poll_start_t, sizeof(pctx->poll_start_t));
220 bzero(&pctx->prev_t, sizeof(pctx->prev_t));
6f96e50e 221 crit_exit();
0eb141f0
SZ
222}
223
984263bc 224/*
1630efc5 225 * Initialize per-cpu polling(4) context. Called from kern_clock.c:
984263bc 226 */
94ebffcd
SZ
227void
228init_device_poll_pcpu(int cpuid)
229{
230 struct pollctx *pctx;
231 char cpuid_str[3];
232
1630efc5
SZ
233 if (cpuid >= POLLCTX_MAX)
234 return;
235
236 if (((1 << cpuid) & poll_cpumask0) == 0)
94ebffcd
SZ
237 return;
238
37bca80f
SZ
239 if (poll_burst_max < MIN_POLL_BURST_MAX)
240 poll_burst_max = MIN_POLL_BURST_MAX;
241 else if (poll_burst_max > MAX_POLL_BURST_MAX)
242 poll_burst_max = MAX_POLL_BURST_MAX;
243
ae8add5b
SZ
244 if (poll_each_burst > poll_burst_max)
245 poll_each_burst = poll_burst_max;
246
1630efc5
SZ
247 poll_cpumask |= (1 << cpuid);
248
94ebffcd
SZ
249 pctx = kmalloc(sizeof(*pctx), M_DEVBUF, M_WAITOK | M_ZERO);
250
ae8add5b 251 pctx->poll_each_burst = poll_each_burst;
37bca80f 252 pctx->poll_burst_max = poll_burst_max;
94ebffcd
SZ
253 pctx->user_frac = 50;
254 pctx->reg_frac = 20;
255 pctx->polling_enabled = polling_enabled;
256 pctx->pollhz = pollhz;
257 pctx->poll_cpuid = cpuid;
0eb141f0 258 poll_reset_state(pctx);
94ebffcd 259
553d42f9
SZ
260 netmsg_init(&pctx->poll_netmsg, &netisr_adone_rport, 0,
261 netisr_poll);
262#ifdef INVARIANTS
263 pctx->poll_netmsg.nm_lmsg.u.ms_resultp = pctx;
264#endif
265
266 netmsg_init(&pctx->poll_more_netmsg, &netisr_adone_rport, 0,
267 netisr_pollmore);
268#ifdef INVARIANTS
269 pctx->poll_more_netmsg.nm_lmsg.u.ms_resultp = pctx;
270#endif
271
94ebffcd
SZ
272 KASSERT(cpuid < POLLCTX_MAX, ("cpu id must < %d", cpuid));
273 poll_context[cpuid] = pctx;
274
275 if (poll_defcpu < 0) {
276 poll_defcpu = cpuid;
277
278 /*
279 * Initialize global sysctl nodes, for compat
280 */
281 poll_add_sysctl(NULL, SYSCTL_STATIC_CHILDREN(_kern_polling),
282 pctx);
283 }
284
285 /*
286 * Initialize per-cpu sysctl nodes
287 */
288 ksnprintf(cpuid_str, sizeof(cpuid_str), "%d", pctx->poll_cpuid);
289
290 sysctl_ctx_init(&pctx->poll_sysctl_ctx);
291 pctx->poll_sysctl_tree = SYSCTL_ADD_NODE(&pctx->poll_sysctl_ctx,
292 SYSCTL_STATIC_CHILDREN(_kern_polling),
293 OID_AUTO, cpuid_str, CTLFLAG_RD, 0, "");
294 poll_add_sysctl(&pctx->poll_sysctl_ctx,
295 SYSCTL_CHILDREN(pctx->poll_sysctl_tree), pctx);
296
297 /*
298 * Initialize systimer
299 */
1630efc5
SZ
300 systimer_init_periodic_nq(&pctx->pollclock, pollclock, pctx, 1);
301}
302
553d42f9
SZ
303static void
304schedpoll_oncpu(struct netmsg *msg)
305{
306 if (msg->nm_lmsg.ms_flags & MSGF_DONE)
307 lwkt_sendmsg(cpu_portfn(mycpuid), &msg->nm_lmsg);
308}
309
1630efc5
SZ
310static __inline void
311schedpoll(struct pollctx *pctx)
312{
6f96e50e 313 crit_enter();
553d42f9 314 schedpoll_oncpu(&pctx->poll_netmsg);
6f96e50e 315 crit_exit();
1630efc5
SZ
316}
317
318static __inline void
319schedpollmore(struct pollctx *pctx)
320{
553d42f9 321 schedpoll_oncpu(&pctx->poll_more_netmsg);
3e61f60e
MD
322}
323
324/*
325 * Set the polling frequency
326 */
327static int
328sysctl_pollhz(SYSCTL_HANDLER_ARGS)
329{
94ebffcd 330 struct pollctx *pctx = arg1;
1630efc5
SZ
331 struct netmsg msg;
332 lwkt_port_t port;
3e61f60e
MD
333 int error, phz;
334
94ebffcd 335 phz = pctx->pollhz;
3e61f60e
MD
336 error = sysctl_handle_int(oidp, &phz, 0, req);
337 if (error || req->newptr == NULL)
338 return error;
339 if (phz <= 0)
340 return EINVAL;
341 else if (phz > DEVICE_POLLING_FREQ_MAX)
342 phz = DEVICE_POLLING_FREQ_MAX;
343
1630efc5
SZ
344 netmsg_init(&msg, &curthread->td_msgport, 0, poll_sysctl_pollhz);
345 msg.nm_lmsg.u.ms_result = phz;
346
347 port = cpu_portfn(pctx->poll_cpuid);
348 lwkt_domsg(port, &msg.nm_lmsg, 0);
3e61f60e
MD
349 return 0;
350}
351
352/*
a7a12d73 353 * Master enable.
3e61f60e
MD
354 */
355static int
356sysctl_polling(SYSCTL_HANDLER_ARGS)
357{
94ebffcd 358 struct pollctx *pctx = arg1;
1630efc5
SZ
359 struct netmsg msg;
360 lwkt_port_t port;
3e61f60e
MD
361 int error, enabled;
362
94ebffcd 363 enabled = pctx->polling_enabled;
3e61f60e
MD
364 error = sysctl_handle_int(oidp, &enabled, 0, req);
365 if (error || req->newptr == NULL)
366 return error;
94ebffcd 367
1630efc5
SZ
368 netmsg_init(&msg, &curthread->td_msgport, 0, poll_sysctl_polling);
369 msg.nm_lmsg.u.ms_result = enabled;
370
371 port = cpu_portfn(pctx->poll_cpuid);
372 lwkt_domsg(port, &msg.nm_lmsg, 0);
3e61f60e 373 return 0;
984263bc
MD
374}
375
8a43da2b
SZ
376static int
377sysctl_regfrac(SYSCTL_HANDLER_ARGS)
378{
379 struct pollctx *pctx = arg1;
380 struct netmsg msg;
381 lwkt_port_t port;
382 uint32_t reg_frac;
383 int error;
384
385 reg_frac = pctx->reg_frac;
386 error = sysctl_handle_int(oidp, &reg_frac, 0, req);
387 if (error || req->newptr == NULL)
388 return error;
389
390 netmsg_init(&msg, &curthread->td_msgport, 0, poll_sysctl_regfrac);
391 msg.nm_lmsg.u.ms_result = reg_frac;
392
393 port = cpu_portfn(pctx->poll_cpuid);
394 lwkt_domsg(port, &msg.nm_lmsg, 0);
395 return 0;
396}
397
b372084d
SZ
398static int
399sysctl_burstmax(SYSCTL_HANDLER_ARGS)
400{
401 struct pollctx *pctx = arg1;
402 struct netmsg msg;
403 lwkt_port_t port;
404 uint32_t burst_max;
405 int error;
406
407 burst_max = pctx->poll_burst_max;
408 error = sysctl_handle_int(oidp, &burst_max, 0, req);
409 if (error || req->newptr == NULL)
410 return error;
411 if (burst_max < MIN_POLL_BURST_MAX)
412 burst_max = MIN_POLL_BURST_MAX;
413 else if (burst_max > MAX_POLL_BURST_MAX)
414 burst_max = MAX_POLL_BURST_MAX;
415
416 netmsg_init(&msg, &curthread->td_msgport, 0, poll_sysctl_burstmax);
417 msg.nm_lmsg.u.ms_result = burst_max;
418
419 port = cpu_portfn(pctx->poll_cpuid);
420 lwkt_domsg(port, &msg.nm_lmsg, 0);
421 return 0;
422}
423
e79f3356
SZ
424static int
425sysctl_eachburst(SYSCTL_HANDLER_ARGS)
426{
427 struct pollctx *pctx = arg1;
428 struct netmsg msg;
429 lwkt_port_t port;
430 uint32_t each_burst;
431 int error;
432
433 each_burst = pctx->poll_each_burst;
434 error = sysctl_handle_int(oidp, &each_burst, 0, req);
435 if (error || req->newptr == NULL)
436 return error;
437
438 netmsg_init(&msg, &curthread->td_msgport, 0, poll_sysctl_eachburst);
439 msg.nm_lmsg.u.ms_result = each_burst;
440
441 port = cpu_portfn(pctx->poll_cpuid);
442 lwkt_domsg(port, &msg.nm_lmsg, 0);
443 return 0;
444}
445
984263bc 446/*
a7a12d73
SZ
447 * Hook from polling systimer. Tries to schedule a netisr, but keeps
448 * track of lost ticks due to the previous handler taking too long.
984263bc
MD
449 * Normally, this should not happen, because polling handler should
450 * run for a short time. However, in some cases (e.g. when there are
451 * changes in link status etc.) the drivers take a very long time
452 * (even in the order of milliseconds) to reset and reconfigure the
453 * device, causing apparent lost polls.
454 *
455 * The first part of the code is just for debugging purposes, and tries
456 * to count how often hardclock ticks are shorter than they should,
457 * meaning either stray interrupts or delayed events.
88c4d2f6
MD
458 *
459 * WARNING! called from fastint or IPI, the MP lock might not be held.
984263bc 460 */
3e61f60e 461static void
94ebffcd 462pollclock(systimer_t info, struct intrframe *frame __unused)
984263bc 463{
94ebffcd
SZ
464 struct pollctx *pctx = info->data;
465 struct timeval t;
984263bc
MD
466 int delta;
467
94ebffcd 468 if (pctx->poll_handlers == 0)
984263bc
MD
469 return;
470
471 microuptime(&t);
94ebffcd
SZ
472 delta = (t.tv_usec - pctx->prev_t.tv_usec) +
473 (t.tv_sec - pctx->prev_t.tv_sec)*1000000;
fe7674e3 474 if (delta * pctx->pollhz < 500000)
94ebffcd 475 pctx->short_ticks++;
984263bc 476 else
94ebffcd 477 pctx->prev_t = t;
984263bc 478
94ebffcd 479 if (pctx->pending_polls > 100) {
984263bc
MD
480 /*
481 * Too much, assume it has stalled (not always true
482 * see comment above).
483 */
94ebffcd
SZ
484 pctx->stalled++;
485 pctx->pending_polls = 0;
486 pctx->phase = 0;
984263bc
MD
487 }
488
94ebffcd
SZ
489 if (pctx->phase <= 2) {
490 if (pctx->phase != 0)
491 pctx->suspect++;
492 pctx->phase = 1;
1630efc5 493 schedpoll(pctx);
94ebffcd 494 pctx->phase = 2;
984263bc 495 }
94ebffcd
SZ
496 if (pctx->pending_polls++ > 0)
497 pctx->lost_polls++;
984263bc
MD
498}
499
984263bc
MD
500/*
501 * netisr_pollmore is called after other netisr's, possibly scheduling
502 * another NETISR_POLL call, or adapting the burst size for the next cycle.
503 *
504 * It is very bad to fetch large bursts of packets from a single card at once,
5bdc9beb
SZ
505 * because the burst could take a long time to be completely processed leading
506 * to unfairness. To reduce the problem, and also to account better for time
507 * spent in network-related processing, we split the burst in smaller chunks
508 * of fixed size, giving control to the other netisr's between chunks. This
509 * helps in improving the fairness, reducing livelock (because we emulate more
510 * closely the "process to completion" that we have with fastforwarding) and
511 * accounting for the work performed in low level handling and forwarding.
984263bc
MD
512 */
513
bf82f9b7 514/* ARGSUSED */
4599cf19 515static void
df4596e1 516netisr_pollmore(struct netmsg *msg)
984263bc 517{
94ebffcd 518 struct pollctx *pctx;
984263bc 519 struct timeval t;
94ebffcd 520 int kern_load, cpuid;
6f96e50e 521 uint32_t pending_polls;
94ebffcd
SZ
522
523 cpuid = mycpu->gd_cpuid;
524 KKASSERT(cpuid < POLLCTX_MAX);
525
526 pctx = poll_context[cpuid];
527 KKASSERT(pctx != NULL);
528 KKASSERT(pctx->poll_cpuid == cpuid);
1630efc5 529 KKASSERT(pctx == msg->nm_lmsg.u.ms_resultp);
984263bc 530
a29576fc 531 lwkt_replymsg(&msg->nm_lmsg, 0);
1630efc5 532
0eb141f0
SZ
533 if (pctx->poll_handlers == 0)
534 return;
535
536 KASSERT(pctx->polling_enabled,
537 ("# of registered poll handlers are not zero, "
538 "but polling is not enabled\n"));
539
94ebffcd
SZ
540 pctx->phase = 5;
541 if (pctx->residual_burst > 0) {
1630efc5 542 schedpoll(pctx);
984263bc 543 /* will run immediately on return, followed by netisrs */
1630efc5 544 return;
984263bc
MD
545 }
546 /* here we can account time spent in netisr's in this tick */
547 microuptime(&t);
94ebffcd
SZ
548 kern_load = (t.tv_usec - pctx->poll_start_t.tv_usec) +
549 (t.tv_sec - pctx->poll_start_t.tv_sec)*1000000; /* us */
fe7674e3 550 kern_load = (kern_load * pctx->pollhz) / 10000; /* 0..100 */
94ebffcd
SZ
551 if (kern_load > (100 - pctx->user_frac)) { /* try decrease ticks */
552 if (pctx->poll_burst > 1)
553 pctx->poll_burst--;
984263bc 554 } else {
94ebffcd
SZ
555 if (pctx->poll_burst < pctx->poll_burst_max)
556 pctx->poll_burst++;
984263bc
MD
557 }
558
6f96e50e 559 crit_enter();
94ebffcd 560 pctx->pending_polls--;
6f96e50e
SZ
561 pending_polls = pctx->pending_polls;
562 crit_exit();
563
564 if (pending_polls == 0) { /* we are done */
94ebffcd 565 pctx->phase = 0;
a29576fc 566 } else {
984263bc
MD
567 /*
568 * Last cycle was long and caused us to miss one or more
569 * hardclock ticks. Restart processing again, but slightly
570 * reduce the burst size to prevent that this happens again.
571 */
94ebffcd
SZ
572 pctx->poll_burst -= (pctx->poll_burst / 8);
573 if (pctx->poll_burst < 1)
574 pctx->poll_burst = 1;
1630efc5 575 schedpoll(pctx);
94ebffcd 576 pctx->phase = 6;
984263bc 577 }
984263bc
MD
578}
579
580/*
a7a12d73
SZ
581 * netisr_poll is scheduled by schedpoll when appropriate, typically once
582 * per polling systimer tick.
a29576fc
MD
583 *
584 * Note that the message is replied immediately in order to allow a new
585 * ISR to be scheduled in the handler.
e43a034f
MD
586 *
587 * XXX each registration should indicate whether it needs a critical
588 * section to operate.
984263bc 589 */
bf82f9b7 590/* ARGSUSED */
4599cf19 591static void
df4596e1 592netisr_poll(struct netmsg *msg)
984263bc 593{
94ebffcd
SZ
594 struct pollctx *pctx;
595 int i, cycles, cpuid;
984263bc 596 enum poll_cmd arg = POLL_ONLY;
984263bc 597
94ebffcd
SZ
598 cpuid = mycpu->gd_cpuid;
599 KKASSERT(cpuid < POLLCTX_MAX);
600
601 pctx = poll_context[cpuid];
602 KKASSERT(pctx != NULL);
603 KKASSERT(pctx->poll_cpuid == cpuid);
1630efc5 604 KKASSERT(pctx == msg->nm_lmsg.u.ms_resultp);
94ebffcd 605
6f96e50e 606 crit_enter();
a29576fc 607 lwkt_replymsg(&msg->nm_lmsg, 0);
6f96e50e 608 crit_exit();
1630efc5 609
0eb141f0
SZ
610 if (pctx->poll_handlers == 0)
611 return;
612
613 KASSERT(pctx->polling_enabled,
614 ("# of registered poll handlers are not zero, "
615 "but polling is not enabled\n"));
616
94ebffcd
SZ
617 pctx->phase = 3;
618 if (pctx->residual_burst == 0) { /* first call in this tick */
619 microuptime(&pctx->poll_start_t);
984263bc 620
94ebffcd 621 if (pctx->reg_frac_count-- == 0) {
984263bc 622 arg = POLL_AND_CHECK_STATUS;
94ebffcd 623 pctx->reg_frac_count = pctx->reg_frac - 1;
984263bc 624 }
984263bc 625
94ebffcd 626 pctx->residual_burst = pctx->poll_burst;
984263bc 627 }
94ebffcd
SZ
628 cycles = (pctx->residual_burst < pctx->poll_each_burst) ?
629 pctx->residual_burst : pctx->poll_each_burst;
630 pctx->residual_burst -= cycles;
984263bc 631
0eb141f0
SZ
632 for (i = 0 ; i < pctx->poll_handlers ; i++) {
633 struct ifnet *ifp = pctx->pr[i].ifp;
55b970a5 634
0eb141f0
SZ
635 if (!lwkt_serialize_try(ifp->if_serializer))
636 continue;
1630efc5 637
0eb141f0 638 if ((ifp->if_flags & (IFF_UP|IFF_RUNNING|IFF_POLLING))
d16c94f7
SZ
639 == (IFF_UP|IFF_RUNNING|IFF_POLLING)) {
640 logpoll(beg, ifp);
641 crit_enter();
0eb141f0 642 ifp->if_poll(ifp, arg, cycles);
d16c94f7
SZ
643 crit_exit();
644 logpoll(end, ifp);
645 }
1630efc5 646
0eb141f0 647 lwkt_serialize_exit(ifp->if_serializer);
984263bc 648 }
0eb141f0 649
1630efc5 650 schedpollmore(pctx);
94ebffcd 651 pctx->phase = 4;
1630efc5
SZ
652}
653
654static void
655poll_register(struct netmsg *msg)
656{
657 struct ifnet *ifp = msg->nm_lmsg.u.ms_resultp;
658 struct pollctx *pctx;
659 int rc, cpuid;
660
661 cpuid = mycpu->gd_cpuid;
662 KKASSERT(cpuid < POLLCTX_MAX);
663
664 pctx = poll_context[cpuid];
665 KKASSERT(pctx != NULL);
666 KKASSERT(pctx->poll_cpuid == cpuid);
667
668 if (pctx->polling_enabled == 0) {
669 /* Polling disabled, cannot register */
670 rc = EOPNOTSUPP;
671 goto back;
672 }
673
674 /*
675 * Check if there is room.
676 */
677 if (pctx->poll_handlers >= POLL_LIST_LEN) {
678 /*
679 * List full, cannot register more entries.
680 * This should never happen; if it does, it is probably a
681 * broken driver trying to register multiple times. Checking
682 * this at runtime is expensive, and won't solve the problem
683 * anyways, so just report a few times and then give up.
684 */
685 static int verbose = 10; /* XXX */
686 if (verbose >0) {
687 kprintf("poll handlers list full, "
688 "maybe a broken driver ?\n");
689 verbose--;
690 }
691 rc = ENOMEM;
692 } else {
693 pctx->pr[pctx->poll_handlers].ifp = ifp;
694 pctx->poll_handlers++;
695 rc = 0;
696
697 if (pctx->poll_handlers == 1) {
698 KKASSERT(pctx->polling_enabled);
699 systimer_adjust_periodic(&pctx->pollclock,
700 pctx->pollhz);
701 }
702 }
703back:
704 lwkt_replymsg(&msg->nm_lmsg, rc);
984263bc
MD
705}
706
707/*
708 * Try to register routine for polling. Returns 1 if successful
709 * (and polling should be enabled), 0 otherwise.
984263bc 710 *
9c095379 711 * Called from mainline code only, not called from an interrupt.
984263bc
MD
712 */
713int
9c095379 714ether_poll_register(struct ifnet *ifp)
984263bc 715{
94ebffcd
SZ
716 if (poll_defcpu < 0)
717 return 0;
718 KKASSERT(poll_defcpu < POLLCTX_MAX);
719
1630efc5
SZ
720 return ether_pollcpu_register(ifp, poll_defcpu);
721}
94ebffcd 722
1630efc5
SZ
723int
724ether_pollcpu_register(struct ifnet *ifp, int cpuid)
725{
726 struct netmsg msg;
727 lwkt_port_t port;
728 int rc;
729
730 if (ifp->if_poll == NULL) {
731 /* Device does not support polling */
984263bc 732 return 0;
1630efc5
SZ
733 }
734
735 if (cpuid < 0 || cpuid >= POLLCTX_MAX)
984263bc 736 return 0;
1630efc5
SZ
737
738 if (((1 << cpuid) & poll_cpumask) == 0) {
739 /* Polling is not supported on 'cpuid' */
9c095379 740 return 0;
1630efc5
SZ
741 }
742 KKASSERT(poll_context[cpuid] != NULL);
9c095379
MD
743
744 /*
745 * Attempt to register. Interlock with IFF_POLLING.
746 */
747 crit_enter(); /* XXX MP - not mp safe */
1630efc5 748
78195a76 749 lwkt_serialize_enter(ifp->if_serializer);
1630efc5
SZ
750 if (ifp->if_flags & IFF_POLLING) {
751 /* Already polling */
752 KKASSERT(ifp->if_poll_cpuid >= 0);
753 lwkt_serialize_exit(ifp->if_serializer);
754 crit_exit();
755 return 0;
756 }
757 KKASSERT(ifp->if_poll_cpuid < 0);
9c095379 758 ifp->if_flags |= IFF_POLLING;
1630efc5 759 ifp->if_poll_cpuid = cpuid;
1736eecd
SZ
760 if (ifp->if_flags & IFF_RUNNING)
761 ifp->if_poll(ifp, POLL_REGISTER, 0);
78195a76 762 lwkt_serialize_exit(ifp->if_serializer);
984263bc 763
1630efc5
SZ
764 netmsg_init(&msg, &curthread->td_msgport, 0, poll_register);
765 msg.nm_lmsg.u.ms_resultp = ifp;
766
767 port = cpu_portfn(cpuid);
768 lwkt_domsg(port, &msg.nm_lmsg, 0);
769
770 if (msg.nm_lmsg.ms_error) {
78195a76 771 lwkt_serialize_enter(ifp->if_serializer);
1736eecd 772 ifp->if_flags &= ~IFF_POLLING;
1630efc5 773 ifp->if_poll_cpuid = -1;
1736eecd
SZ
774 if (ifp->if_flags & IFF_RUNNING)
775 ifp->if_poll(ifp, POLL_DEREGISTER, 0);
78195a76 776 lwkt_serialize_exit(ifp->if_serializer);
b7572f04
JS
777 rc = 0;
778 } else {
b7572f04
JS
779 rc = 1;
780 }
1630efc5 781
9c095379 782 crit_exit();
1630efc5
SZ
783 return rc;
784}
785
786static void
787poll_deregister(struct netmsg *msg)
788{
789 struct ifnet *ifp = msg->nm_lmsg.u.ms_resultp;
790 struct pollctx *pctx;
791 int rc, i, cpuid;
792
793 cpuid = mycpu->gd_cpuid;
794 KKASSERT(cpuid < POLLCTX_MAX);
795
796 pctx = poll_context[cpuid];
797 KKASSERT(pctx != NULL);
798 KKASSERT(pctx->poll_cpuid == cpuid);
799
800 for (i = 0 ; i < pctx->poll_handlers ; i++) {
801 if (pctx->pr[i].ifp == ifp) /* Found it */
802 break;
803 }
804 if (i == pctx->poll_handlers) {
805 kprintf("ether_poll_deregister: ifp not found!!!\n");
806 rc = ENOENT;
807 } else {
808 pctx->poll_handlers--;
809 if (i < pctx->poll_handlers) {
810 /* Last entry replaces this one. */
811 pctx->pr[i].ifp = pctx->pr[pctx->poll_handlers].ifp;
812 }
813
0eb141f0 814 if (pctx->poll_handlers == 0) {
1630efc5 815 systimer_adjust_periodic(&pctx->pollclock, 1);
0eb141f0
SZ
816 poll_reset_state(pctx);
817 }
1630efc5
SZ
818 rc = 0;
819 }
820 lwkt_replymsg(&msg->nm_lmsg, rc);
984263bc
MD
821}
822
823/*
9c095379
MD
824 * Remove interface from the polling list. Occurs when polling is turned
825 * off. Called from mainline code only, not called from an interrupt.
984263bc
MD
826 */
827int
828ether_poll_deregister(struct ifnet *ifp)
829{
1630efc5
SZ
830 struct netmsg msg;
831 lwkt_port_t port;
832 int rc, cpuid;
b7572f04 833
0a39ac0c
SZ
834 KKASSERT(ifp != NULL);
835
1630efc5 836 if (ifp->if_poll == NULL)
94ebffcd 837 return 0;
94ebffcd 838
9c095379 839 crit_enter();
1630efc5
SZ
840
841 lwkt_serialize_enter(ifp->if_serializer);
0a39ac0c 842 if ((ifp->if_flags & IFF_POLLING) == 0) {
1630efc5
SZ
843 KKASSERT(ifp->if_poll_cpuid < 0);
844 lwkt_serialize_exit(ifp->if_serializer);
9c095379 845 crit_exit();
984263bc
MD
846 return 0;
847 }
9c095379 848
1630efc5
SZ
849 cpuid = ifp->if_poll_cpuid;
850 KKASSERT(cpuid >= 0);
851 KKASSERT(poll_context[cpuid] != NULL);
852
853 ifp->if_flags &= ~IFF_POLLING;
854 ifp->if_poll_cpuid = -1;
855 lwkt_serialize_exit(ifp->if_serializer);
856
857 netmsg_init(&msg, &curthread->td_msgport, 0, poll_deregister);
858 msg.nm_lmsg.u.ms_resultp = ifp;
859
860 port = cpu_portfn(cpuid);
861 lwkt_domsg(port, &msg.nm_lmsg, 0);
862
863 if (!msg.nm_lmsg.ms_error) {
78195a76 864 lwkt_serialize_enter(ifp->if_serializer);
1630efc5
SZ
865 if (ifp->if_flags & IFF_RUNNING)
866 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
78195a76 867 lwkt_serialize_exit(ifp->if_serializer);
1630efc5
SZ
868 rc = 1;
869 } else {
870 rc = 0;
78195a76 871 }
1630efc5
SZ
872
873 crit_exit();
874 return rc;
984263bc 875}
94ebffcd
SZ
876
877static void
878poll_add_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent,
879 struct pollctx *pctx)
880{
881 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "enable",
882 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_polling,
883 "I", "Polling enabled");
884
885 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "pollhz",
886 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_pollhz,
887 "I", "Device polling frequency");
888
8a43da2b
SZ
889 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "reg_frac",
890 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_regfrac,
891 "IU", "Every this many cycles poll register");
892
b372084d
SZ
893 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "burst_max",
894 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_burstmax,
895 "IU", "Max Polling burst size");
896
e79f3356
SZ
897 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "each_burst",
898 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_eachburst,
899 "IU", "Max size of each burst");
900
94ebffcd
SZ
901 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "phase", CTLFLAG_RD,
902 &pctx->phase, 0, "Polling phase");
903
904 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "suspect", CTLFLAG_RW,
905 &pctx->suspect, 0, "suspect event");
906
907 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "stalled", CTLFLAG_RW,
908 &pctx->stalled, 0, "potential stalls");
909
a76f9acb 910 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "burst", CTLFLAG_RD,
94ebffcd
SZ
911 &pctx->poll_burst, 0, "Current polling burst size");
912
94ebffcd
SZ
913 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "user_frac", CTLFLAG_RW,
914 &pctx->user_frac, 0,
915 "Desired user fraction of cpu time");
916
94ebffcd
SZ
917 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "short_ticks", CTLFLAG_RW,
918 &pctx->short_ticks, 0,
919 "Hardclock ticks shorter than they should be");
920
921 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "lost_polls", CTLFLAG_RW,
922 &pctx->lost_polls, 0,
923 "How many times we would have lost a poll tick");
924
925 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "pending_polls", CTLFLAG_RD,
926 &pctx->pending_polls, 0, "Do we need to poll again");
927
a76f9acb 928 SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "residual_burst", CTLFLAG_RD,
94ebffcd
SZ
929 &pctx->residual_burst, 0,
930 "# of residual cycles in burst");
931
932 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "handlers", CTLFLAG_RD,
933 &pctx->poll_handlers, 0,
934 "Number of registered poll handlers");
935}
1630efc5 936
1630efc5
SZ
937static void
938poll_sysctl_pollhz(struct netmsg *msg)
939{
940 struct pollctx *pctx;
941 int cpuid;
942
943 cpuid = mycpu->gd_cpuid;
944 KKASSERT(cpuid < POLLCTX_MAX);
945
946 pctx = poll_context[cpuid];
947 KKASSERT(pctx != NULL);
948 KKASSERT(pctx->poll_cpuid == cpuid);
949
a7a12d73
SZ
950 /*
951 * If polling is disabled or there is no device registered,
952 * don't adjust polling systimer frequency.
953 * Polling systimer frequency will be adjusted once polling
954 * is enabled and there are registered devices.
955 */
1630efc5
SZ
956 pctx->pollhz = msg->nm_lmsg.u.ms_result;
957 if (pctx->polling_enabled && pctx->poll_handlers)
958 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
8a43da2b
SZ
959
960 /*
961 * Make sure that reg_frac and reg_frac_count are within valid range.
962 */
963 if (pctx->reg_frac > pctx->pollhz) {
964 pctx->reg_frac = pctx->pollhz;
965 if (pctx->reg_frac_count > pctx->reg_frac)
966 pctx->reg_frac_count = pctx->reg_frac - 1;
967 }
968
1630efc5
SZ
969 lwkt_replymsg(&msg->nm_lmsg, 0);
970}
971
972static void
973poll_sysctl_polling(struct netmsg *msg)
974{
975 struct pollctx *pctx;
976 int cpuid;
977
978 cpuid = mycpu->gd_cpuid;
979 KKASSERT(cpuid < POLLCTX_MAX);
980
981 pctx = poll_context[cpuid];
982 KKASSERT(pctx != NULL);
983 KKASSERT(pctx->poll_cpuid == cpuid);
984
a7a12d73
SZ
985 /*
986 * If polling is disabled or there is no device registered,
987 * cut the polling systimer frequency to 1hz.
988 */
1630efc5 989 pctx->polling_enabled = msg->nm_lmsg.u.ms_result;
0eb141f0 990 if (pctx->polling_enabled && pctx->poll_handlers) {
1630efc5 991 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
0eb141f0 992 } else {
1630efc5 993 systimer_adjust_periodic(&pctx->pollclock, 1);
0eb141f0
SZ
994 poll_reset_state(pctx);
995 }
996
997 if (!pctx->polling_enabled && pctx->poll_handlers != 0) {
998 int i;
999
1000 for (i = 0 ; i < pctx->poll_handlers ; i++) {
1001 struct ifnet *ifp = pctx->pr[i].ifp;
1002
1003 lwkt_serialize_enter(ifp->if_serializer);
1004
1005 if ((ifp->if_flags & IFF_POLLING) == 0) {
1006 KKASSERT(ifp->if_poll_cpuid < 0);
1007 lwkt_serialize_exit(ifp->if_serializer);
1008 continue;
1009 }
1010 ifp->if_flags &= ~IFF_POLLING;
1011 ifp->if_poll_cpuid = -1;
1012
1013 /*
1014 * Only call the interface deregistration
1015 * function if the interface is still
1016 * running.
1017 */
1018 if (ifp->if_flags & IFF_RUNNING)
1019 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
1020
1021 lwkt_serialize_exit(ifp->if_serializer);
1022 }
1023 pctx->poll_handlers = 0;
1024 }
1025
1630efc5
SZ
1026 lwkt_replymsg(&msg->nm_lmsg, 0);
1027}
8a43da2b
SZ
1028
1029static void
1030poll_sysctl_regfrac(struct netmsg *msg)
1031{
1032 struct pollctx *pctx;
1033 uint32_t reg_frac;
1034 int cpuid;
1035
1036 cpuid = mycpu->gd_cpuid;
1037 KKASSERT(cpuid < POLLCTX_MAX);
1038
1039 pctx = poll_context[cpuid];
1040 KKASSERT(pctx != NULL);
1041 KKASSERT(pctx->poll_cpuid == cpuid);
1042
1043 reg_frac = msg->nm_lmsg.u.ms_result;
1044 if (reg_frac > pctx->pollhz)
1045 reg_frac = pctx->pollhz;
1046 else if (reg_frac < 1)
1047 reg_frac = 1;
1048
1049 pctx->reg_frac = reg_frac;
1050 if (pctx->reg_frac_count > pctx->reg_frac)
1051 pctx->reg_frac_count = pctx->reg_frac - 1;
1052
1053 lwkt_replymsg(&msg->nm_lmsg, 0);
1054}
b372084d
SZ
1055
1056static void
1057poll_sysctl_burstmax(struct netmsg *msg)
1058{
1059 struct pollctx *pctx;
1060 int cpuid;
1061
1062 cpuid = mycpu->gd_cpuid;
1063 KKASSERT(cpuid < POLLCTX_MAX);
1064
1065 pctx = poll_context[cpuid];
1066 KKASSERT(pctx != NULL);
1067 KKASSERT(pctx->poll_cpuid == cpuid);
1068
1069 pctx->poll_burst_max = msg->nm_lmsg.u.ms_result;
1070 if (pctx->poll_each_burst > pctx->poll_burst_max)
1071 pctx->poll_each_burst = pctx->poll_burst_max;
1072 if (pctx->poll_burst > pctx->poll_burst_max)
1073 pctx->poll_burst = pctx->poll_burst_max;
1074 if (pctx->residual_burst > pctx->poll_burst_max)
1075 pctx->residual_burst = pctx->poll_burst_max;
1076
1077 lwkt_replymsg(&msg->nm_lmsg, 0);
1078}
e79f3356
SZ
1079
1080static void
1081poll_sysctl_eachburst(struct netmsg *msg)
1082{
1083 struct pollctx *pctx;
1084 uint32_t each_burst;
1085 int cpuid;
1086
1087 cpuid = mycpu->gd_cpuid;
1088 KKASSERT(cpuid < POLLCTX_MAX);
1089
1090 pctx = poll_context[cpuid];
1091 KKASSERT(pctx != NULL);
1092 KKASSERT(pctx->poll_cpuid == cpuid);
1093
1094 each_burst = msg->nm_lmsg.u.ms_result;
1095 if (each_burst > pctx->poll_burst_max)
1096 each_burst = pctx->poll_burst_max;
1097 else if (each_burst < 1)
1098 each_burst = 1;
1099 pctx->poll_each_burst = each_burst;
1100
1101 lwkt_replymsg(&msg->nm_lmsg, 0);
1102}