Modify ktr(4) to be typesafe
[dragonfly.git] / sys / kern / kern_poll.c
... / ...
CommitLineData
1/*-
2 * Copyright (c) 2001-2002 Luigi Rizzo
3 *
4 * Supported by: the Xorp Project (www.xorp.org)
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: src/sys/kern/kern_poll.c,v 1.2.2.4 2002/06/27 23:26:33 luigi Exp $
28 * $DragonFly: src/sys/kern/kern_poll.c,v 1.48 2008/09/24 12:07:19 sephe Exp $
29 */
30
31#include "opt_polling.h"
32
33#include <sys/param.h>
34#include <sys/kernel.h>
35#include <sys/ktr.h>
36#include <sys/socket.h> /* needed by net/if.h */
37#include <sys/sysctl.h>
38
39#include <sys/thread2.h>
40#include <sys/msgport2.h>
41
42#include <net/if.h> /* for IFF_* flags */
43#include <net/netmsg2.h>
44
45/*
46 * Polling support for [network] device drivers.
47 *
48 * Drivers which support this feature try to register with the
49 * polling code.
50 *
51 * If registration is successful, the driver must disable interrupts,
52 * and further I/O is performed through the handler, which is invoked
53 * (at least once per clock tick) with 3 arguments: the "arg" passed at
54 * register time (a struct ifnet pointer), a command, and a "count" limit.
55 *
56 * The command can be one of the following:
57 * POLL_ONLY: quick move of "count" packets from input/output queues.
58 * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
59 * other more expensive operations. This command is issued periodically
60 * but less frequently than POLL_ONLY.
61 * POLL_DEREGISTER: deregister and return to interrupt mode.
62 * POLL_REGISTER: register and disable interrupts
63 *
64 * The first two commands are only issued if the interface is marked as
65 * 'IFF_UP, IFF_RUNNING and IFF_POLLING', the last two only if IFF_RUNNING
66 * is set.
67 *
68 * The count limit specifies how much work the handler can do during the
69 * call -- typically this is the number of packets to be received, or
70 * transmitted, etc. (drivers are free to interpret this number, as long
71 * as the max time spent in the function grows roughly linearly with the
72 * count).
73 *
74 * Deregistration can be requested by the driver itself (typically in the
75 * *_stop() routine), or by the polling code, by invoking the handler.
76 *
77 * Polling can be enabled or disabled on particular CPU_X with the sysctl
78 * variable kern.polling.X.enable (default is 1, enabled)
79 *
80 * A second variable controls the sharing of CPU between polling/kernel
81 * network processing, and other activities (typically userlevel tasks):
82 * kern.polling.X.user_frac (between 0 and 100, default 50) sets the share
83 * of CPU allocated to user tasks. CPU is allocated proportionally to the
84 * shares, by dynamically adjusting the "count" (poll_burst).
85 *
86 * Other parameters should be left to their default values.
87 * The following constraints hold
88 *
89 * 1 <= poll_burst <= poll_burst_max
90 * 1 <= poll_each_burst <= poll_burst_max
91 * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
92 */
93
/*
 * Burst sizing.  poll_burst_max is clamped to
 * [MIN_POLL_BURST_MAX, MAX_POLL_BURST_MAX]; POLL_BURST_MAX and
 * POLL_EACH_BURST are the boot-time defaults of the corresponding tunables.
 */
#define MIN_POLL_BURST_MAX		10
#define MAX_POLL_BURST_MAX		1000
#define POLL_BURST_MAX			150	/* good for 100Mbit net and HZ=1000 */
#define POLL_EACH_BURST			5

/*
 * Polling frequency.  The upper limit may be overridden at build time;
 * the default is what pollhz starts out as.
 */
#if !defined(DEVICE_POLLING_FREQ_MAX)
#define DEVICE_POLLING_FREQ_MAX		30000
#endif
#define DEVICE_POLLING_FREQ_DEFAULT	2000
103
/* Number of slots in a polling context's handler table (pollctx.pr[]). */
#define POLL_LIST_LEN	128

/* One registered polling client: the network interface to be polled. */
struct pollrec {
	struct ifnet	*ifp;
};
108
109#define POLLCTX_MAX 32
110
111struct pollctx {
112 struct sysctl_ctx_list poll_sysctl_ctx;
113 struct sysctl_oid *poll_sysctl_tree;
114
115 uint32_t poll_burst; /* state */
116 uint32_t poll_each_burst; /* tunable */
117 uint32_t poll_burst_max; /* tunable */
118 uint32_t user_frac; /* tunable */
119 int reg_frac_count; /* state */
120 uint32_t reg_frac; /* tunable */
121 uint32_t short_ticks; /* statistics */
122 uint32_t lost_polls; /* statistics */
123 uint32_t pending_polls; /* state */
124 int residual_burst; /* state */
125 uint32_t phase; /* state */
126 uint32_t suspect; /* statistics */
127 uint32_t stalled; /* statistics */
128 struct timeval poll_start_t; /* state */
129 struct timeval prev_t; /* state */
130
131 uint32_t poll_handlers; /* next free entry in pr[]. */
132 struct pollrec pr[POLL_LIST_LEN];
133
134 int poll_cpuid;
135 struct systimer pollclock;
136 int polling_enabled; /* tunable */
137 int pollhz; /* tunable */
138
139 struct netmsg_base poll_netmsg;
140 struct netmsg_base poll_more_netmsg;
141};
142
143static struct pollctx *poll_context[POLLCTX_MAX];
144
145SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
146 "Device polling parameters");
147
148static int poll_defcpu = -1;
149SYSCTL_INT(_kern_polling, OID_AUTO, defcpu, CTLFLAG_RD,
150 &poll_defcpu, 0, "default CPU to run device polling");
151
152static cpumask_t poll_cpumask0 = (cpumask_t)-1;
153TUNABLE_ULONG("kern.polling.cpumask", (u_long *)&poll_cpumask0);
154
155static cpumask_t poll_cpumask;
156SYSCTL_LONG(_kern_polling, OID_AUTO, cpumask, CTLFLAG_RD,
157 &poll_cpumask, 0, "CPUs that can run device polling");
158
159static int polling_enabled = 1; /* global polling enable */
160TUNABLE_INT("kern.polling.enable", &polling_enabled);
161
162static int pollhz = DEVICE_POLLING_FREQ_DEFAULT;
163TUNABLE_INT("kern.polling.pollhz", &pollhz);
164
165static int poll_burst_max = POLL_BURST_MAX;
166TUNABLE_INT("kern.polling.burst_max", &poll_burst_max);
167
168static int poll_each_burst = POLL_EACH_BURST;
169TUNABLE_INT("kern.polling.each_burst", &poll_each_burst);
170
171/* Netisr handlers */
172static void netisr_poll(netmsg_t);
173static void netisr_pollmore(netmsg_t);
174static void poll_register(netmsg_t);
175static void poll_deregister(netmsg_t);
176static void poll_sysctl_pollhz(netmsg_t);
177static void poll_sysctl_polling(netmsg_t);
178static void poll_sysctl_regfrac(netmsg_t);
179static void poll_sysctl_burstmax(netmsg_t);
180static void poll_sysctl_eachburst(netmsg_t);
181
182/* Systimer handler */
183static void pollclock(systimer_t, int, struct intrframe *);
184
185/* Sysctl handlers */
186static int sysctl_pollhz(SYSCTL_HANDLER_ARGS);
187static int sysctl_polling(SYSCTL_HANDLER_ARGS);
188static int sysctl_regfrac(SYSCTL_HANDLER_ARGS);
189static int sysctl_burstmax(SYSCTL_HANDLER_ARGS);
190static int sysctl_eachburst(SYSCTL_HANDLER_ARGS);
191static void poll_add_sysctl(struct sysctl_ctx_list *,
192 struct sysctl_oid_list *, struct pollctx *);
193
194void init_device_poll_pcpu(int); /* per-cpu init routine */
195
196#define POLL_KTR_STRING "ifp=%p"
197#define POLL_KTR_ARG_SIZE (sizeof(void *))
198
199#ifndef KTR_POLLING
200#define KTR_POLLING KTR_ALL
201#endif
202KTR_INFO_MASTER(poll);
203KTR_INFO(KTR_POLLING, poll, beg, 0, "ifp=%p", void *ifp);
204KTR_INFO(KTR_POLLING, poll, end, 1, "ifp=%p", void *ifp);
205
206#define logpoll(name, arg) KTR_LOG(poll_ ## name, arg)
207
208static __inline void
209poll_reset_state(struct pollctx *pctx)
210{
211 crit_enter();
212 pctx->poll_burst = 5;
213 pctx->reg_frac_count = 0;
214 pctx->pending_polls = 0;
215 pctx->residual_burst = 0;
216 pctx->phase = 0;
217 bzero(&pctx->poll_start_t, sizeof(pctx->poll_start_t));
218 bzero(&pctx->prev_t, sizeof(pctx->prev_t));
219 crit_exit();
220}
221
222/*
223 * Initialize per-cpu polling(4) context. Called from kern_clock.c:
224 */
225void
226init_device_poll_pcpu(int cpuid)
227{
228 struct pollctx *pctx;
229 char cpuid_str[3];
230
231 if (cpuid >= POLLCTX_MAX)
232 return;
233
234 if ((CPUMASK(cpuid) & poll_cpumask0) == 0)
235 return;
236
237 if (poll_burst_max < MIN_POLL_BURST_MAX)
238 poll_burst_max = MIN_POLL_BURST_MAX;
239 else if (poll_burst_max > MAX_POLL_BURST_MAX)
240 poll_burst_max = MAX_POLL_BURST_MAX;
241
242 if (poll_each_burst > poll_burst_max)
243 poll_each_burst = poll_burst_max;
244
245 poll_cpumask |= CPUMASK(cpuid);
246
247 pctx = kmalloc(sizeof(*pctx), M_DEVBUF, M_WAITOK | M_ZERO);
248
249 pctx->poll_each_burst = poll_each_burst;
250 pctx->poll_burst_max = poll_burst_max;
251 pctx->user_frac = 50;
252 pctx->reg_frac = 20;
253 pctx->polling_enabled = polling_enabled;
254 pctx->pollhz = pollhz;
255 pctx->poll_cpuid = cpuid;
256 poll_reset_state(pctx);
257
258 netmsg_init(&pctx->poll_netmsg, NULL, &netisr_adone_rport,
259 0, netisr_poll);
260#ifdef INVARIANTS
261 pctx->poll_netmsg.lmsg.u.ms_resultp = pctx;
262#endif
263
264 netmsg_init(&pctx->poll_more_netmsg, NULL, &netisr_adone_rport,
265 0, netisr_pollmore);
266#ifdef INVARIANTS
267 pctx->poll_more_netmsg.lmsg.u.ms_resultp = pctx;
268#endif
269
270 KASSERT(cpuid < POLLCTX_MAX, ("cpu id must < %d", cpuid));
271 poll_context[cpuid] = pctx;
272
273 if (poll_defcpu < 0) {
274 poll_defcpu = cpuid;
275
276 /*
277 * Initialize global sysctl nodes, for compat
278 */
279 poll_add_sysctl(NULL, SYSCTL_STATIC_CHILDREN(_kern_polling),
280 pctx);
281 }
282
283 /*
284 * Initialize per-cpu sysctl nodes
285 */
286 ksnprintf(cpuid_str, sizeof(cpuid_str), "%d", pctx->poll_cpuid);
287
288 sysctl_ctx_init(&pctx->poll_sysctl_ctx);
289 pctx->poll_sysctl_tree = SYSCTL_ADD_NODE(&pctx->poll_sysctl_ctx,
290 SYSCTL_STATIC_CHILDREN(_kern_polling),
291 OID_AUTO, cpuid_str, CTLFLAG_RD, 0, "");
292 poll_add_sysctl(&pctx->poll_sysctl_ctx,
293 SYSCTL_CHILDREN(pctx->poll_sysctl_tree), pctx);
294
295 /*
296 * Initialize systimer
297 */
298 systimer_init_periodic_nq(&pctx->pollclock, pollclock, pctx, 1);
299}
300
301static void
302schedpoll_oncpu(netmsg_t msg)
303{
304 if (msg->lmsg.ms_flags & MSGF_DONE)
305 lwkt_sendmsg(cpu_portfn(mycpuid), &msg->lmsg);
306}
307
308static __inline void
309schedpoll(struct pollctx *pctx)
310{
311 crit_enter();
312 schedpoll_oncpu((netmsg_t)&pctx->poll_netmsg);
313 crit_exit();
314}
315
316static __inline void
317schedpollmore(struct pollctx *pctx)
318{
319 schedpoll_oncpu((netmsg_t)&pctx->poll_more_netmsg);
320}
321
322/*
323 * Set the polling frequency
324 */
325static int
326sysctl_pollhz(SYSCTL_HANDLER_ARGS)
327{
328 struct pollctx *pctx = arg1;
329 struct netmsg_base msg;
330 lwkt_port_t port;
331 int error, phz;
332
333 phz = pctx->pollhz;
334 error = sysctl_handle_int(oidp, &phz, 0, req);
335 if (error || req->newptr == NULL)
336 return error;
337 if (phz <= 0)
338 return EINVAL;
339 else if (phz > DEVICE_POLLING_FREQ_MAX)
340 phz = DEVICE_POLLING_FREQ_MAX;
341
342 netmsg_init(&msg, NULL, &curthread->td_msgport,
343 0, poll_sysctl_pollhz);
344 msg.lmsg.u.ms_result = phz;
345
346 port = cpu_portfn(pctx->poll_cpuid);
347 lwkt_domsg(port, &msg.lmsg, 0);
348 return 0;
349}
350
351/*
352 * Master enable.
353 */
354static int
355sysctl_polling(SYSCTL_HANDLER_ARGS)
356{
357 struct pollctx *pctx = arg1;
358 struct netmsg_base msg;
359 lwkt_port_t port;
360 int error, enabled;
361
362 enabled = pctx->polling_enabled;
363 error = sysctl_handle_int(oidp, &enabled, 0, req);
364 if (error || req->newptr == NULL)
365 return error;
366
367 netmsg_init(&msg, NULL, &curthread->td_msgport,
368 0, poll_sysctl_polling);
369 msg.lmsg.u.ms_result = enabled;
370
371 port = cpu_portfn(pctx->poll_cpuid);
372 lwkt_domsg(port, &msg.lmsg, 0);
373 return 0;
374}
375
376static int
377sysctl_regfrac(SYSCTL_HANDLER_ARGS)
378{
379 struct pollctx *pctx = arg1;
380 struct netmsg_base msg;
381 lwkt_port_t port;
382 uint32_t reg_frac;
383 int error;
384
385 reg_frac = pctx->reg_frac;
386 error = sysctl_handle_int(oidp, &reg_frac, 0, req);
387 if (error || req->newptr == NULL)
388 return error;
389
390 netmsg_init(&msg, NULL, &curthread->td_msgport,
391 0, poll_sysctl_regfrac);
392 msg.lmsg.u.ms_result = reg_frac;
393
394 port = cpu_portfn(pctx->poll_cpuid);
395 lwkt_domsg(port, &msg.lmsg, 0);
396 return 0;
397}
398
399static int
400sysctl_burstmax(SYSCTL_HANDLER_ARGS)
401{
402 struct pollctx *pctx = arg1;
403 struct netmsg_base msg;
404 lwkt_port_t port;
405 uint32_t burst_max;
406 int error;
407
408 burst_max = pctx->poll_burst_max;
409 error = sysctl_handle_int(oidp, &burst_max, 0, req);
410 if (error || req->newptr == NULL)
411 return error;
412 if (burst_max < MIN_POLL_BURST_MAX)
413 burst_max = MIN_POLL_BURST_MAX;
414 else if (burst_max > MAX_POLL_BURST_MAX)
415 burst_max = MAX_POLL_BURST_MAX;
416
417 netmsg_init(&msg, NULL, &curthread->td_msgport,
418 0, poll_sysctl_burstmax);
419 msg.lmsg.u.ms_result = burst_max;
420
421 port = cpu_portfn(pctx->poll_cpuid);
422 lwkt_domsg(port, &msg.lmsg, 0);
423 return 0;
424}
425
426static int
427sysctl_eachburst(SYSCTL_HANDLER_ARGS)
428{
429 struct pollctx *pctx = arg1;
430 struct netmsg_base msg;
431 lwkt_port_t port;
432 uint32_t each_burst;
433 int error;
434
435 each_burst = pctx->poll_each_burst;
436 error = sysctl_handle_int(oidp, &each_burst, 0, req);
437 if (error || req->newptr == NULL)
438 return error;
439
440 netmsg_init(&msg, NULL, &curthread->td_msgport,
441 0, poll_sysctl_eachburst);
442 msg.lmsg.u.ms_result = each_burst;
443
444 port = cpu_portfn(pctx->poll_cpuid);
445 lwkt_domsg(port, &msg.lmsg, 0);
446 return 0;
447}
448
449/*
450 * Hook from polling systimer. Tries to schedule a netisr, but keeps
451 * track of lost ticks due to the previous handler taking too long.
452 * Normally, this should not happen, because polling handler should
453 * run for a short time. However, in some cases (e.g. when there are
454 * changes in link status etc.) the drivers take a very long time
455 * (even in the order of milliseconds) to reset and reconfigure the
456 * device, causing apparent lost polls.
457 *
458 * The first part of the code is just for debugging purposes, and tries
459 * to count how often hardclock ticks are shorter than they should,
460 * meaning either stray interrupts or delayed events.
461 *
462 * WARNING! called from fastint or IPI, the MP lock might not be held.
463 */
464static void
465pollclock(systimer_t info, int in_ipi __unused,
466 struct intrframe *frame __unused)
467{
468 struct pollctx *pctx = info->data;
469 struct timeval t;
470 int delta;
471
472 if (pctx->poll_handlers == 0)
473 return;
474
475 microuptime(&t);
476 delta = (t.tv_usec - pctx->prev_t.tv_usec) +
477 (t.tv_sec - pctx->prev_t.tv_sec)*1000000;
478 if (delta * pctx->pollhz < 500000)
479 pctx->short_ticks++;
480 else
481 pctx->prev_t = t;
482
483 if (pctx->pending_polls > 100) {
484 /*
485 * Too much, assume it has stalled (not always true
486 * see comment above).
487 */
488 pctx->stalled++;
489 pctx->pending_polls = 0;
490 pctx->phase = 0;
491 }
492
493 if (pctx->phase <= 2) {
494 if (pctx->phase != 0)
495 pctx->suspect++;
496 pctx->phase = 1;
497 schedpoll(pctx);
498 pctx->phase = 2;
499 }
500 if (pctx->pending_polls++ > 0)
501 pctx->lost_polls++;
502}
503
504/*
505 * netisr_pollmore is called after other netisr's, possibly scheduling
506 * another NETISR_POLL call, or adapting the burst size for the next cycle.
507 *
508 * It is very bad to fetch large bursts of packets from a single card at once,
509 * because the burst could take a long time to be completely processed leading
510 * to unfairness. To reduce the problem, and also to account better for time
511 * spent in network-related processing, we split the burst in smaller chunks
512 * of fixed size, giving control to the other netisr's between chunks. This
513 * helps in improving the fairness, reducing livelock (because we emulate more
514 * closely the "process to completion" that we have with fastforwarding) and
515 * accounting for the work performed in low level handling and forwarding.
516 */
517
518/* ARGSUSED */
519static void
520netisr_pollmore(netmsg_t msg)
521{
522 struct pollctx *pctx;
523 struct timeval t;
524 int kern_load, cpuid;
525 uint32_t pending_polls;
526
527 cpuid = mycpu->gd_cpuid;
528 KKASSERT(cpuid < POLLCTX_MAX);
529
530 pctx = poll_context[cpuid];
531 KKASSERT(pctx != NULL);
532 KKASSERT(pctx->poll_cpuid == cpuid);
533 KKASSERT(pctx == msg->lmsg.u.ms_resultp);
534
535 lwkt_replymsg(&msg->lmsg, 0);
536
537 if (pctx->poll_handlers == 0)
538 return;
539
540 KASSERT(pctx->polling_enabled,
541 ("# of registered poll handlers are not zero, "
542 "but polling is not enabled\n"));
543
544 pctx->phase = 5;
545 if (pctx->residual_burst > 0) {
546 schedpoll(pctx);
547 /* will run immediately on return, followed by netisrs */
548 return;
549 }
550 /* here we can account time spent in netisr's in this tick */
551 microuptime(&t);
552 kern_load = (t.tv_usec - pctx->poll_start_t.tv_usec) +
553 (t.tv_sec - pctx->poll_start_t.tv_sec)*1000000; /* us */
554 kern_load = (kern_load * pctx->pollhz) / 10000; /* 0..100 */
555 if (kern_load > (100 - pctx->user_frac)) { /* try decrease ticks */
556 if (pctx->poll_burst > 1)
557 pctx->poll_burst--;
558 } else {
559 if (pctx->poll_burst < pctx->poll_burst_max)
560 pctx->poll_burst++;
561 }
562
563 crit_enter();
564 pctx->pending_polls--;
565 pending_polls = pctx->pending_polls;
566 crit_exit();
567
568 if (pending_polls == 0) { /* we are done */
569 pctx->phase = 0;
570 } else {
571 /*
572 * Last cycle was long and caused us to miss one or more
573 * hardclock ticks. Restart processing again, but slightly
574 * reduce the burst size to prevent that this happens again.
575 */
576 pctx->poll_burst -= (pctx->poll_burst / 8);
577 if (pctx->poll_burst < 1)
578 pctx->poll_burst = 1;
579 schedpoll(pctx);
580 pctx->phase = 6;
581 }
582}
583
584/*
585 * netisr_poll is scheduled by schedpoll when appropriate, typically once
586 * per polling systimer tick.
587 *
588 * Note that the message is replied immediately in order to allow a new
589 * ISR to be scheduled in the handler.
590 *
591 * XXX each registration should indicate whether it needs a critical
592 * section to operate.
593 */
594/* ARGSUSED */
595static void
596netisr_poll(netmsg_t msg)
597{
598 struct pollctx *pctx;
599 int i, cycles, cpuid;
600 enum poll_cmd arg = POLL_ONLY;
601
602 cpuid = mycpu->gd_cpuid;
603 KKASSERT(cpuid < POLLCTX_MAX);
604
605 pctx = poll_context[cpuid];
606 KKASSERT(pctx != NULL);
607 KKASSERT(pctx->poll_cpuid == cpuid);
608 KKASSERT(pctx == msg->lmsg.u.ms_resultp);
609
610 crit_enter();
611 lwkt_replymsg(&msg->lmsg, 0);
612 crit_exit();
613
614 if (pctx->poll_handlers == 0)
615 return;
616
617 KASSERT(pctx->polling_enabled,
618 ("# of registered poll handlers are not zero, "
619 "but polling is not enabled\n"));
620
621 pctx->phase = 3;
622 if (pctx->residual_burst == 0) { /* first call in this tick */
623 microuptime(&pctx->poll_start_t);
624
625 if (pctx->reg_frac_count-- == 0) {
626 arg = POLL_AND_CHECK_STATUS;
627 pctx->reg_frac_count = pctx->reg_frac - 1;
628 }
629
630 pctx->residual_burst = pctx->poll_burst;
631 }
632 cycles = (pctx->residual_burst < pctx->poll_each_burst) ?
633 pctx->residual_burst : pctx->poll_each_burst;
634 pctx->residual_burst -= cycles;
635
636 for (i = 0 ; i < pctx->poll_handlers ; i++) {
637 struct ifnet *ifp = pctx->pr[i].ifp;
638
639 if (!ifnet_tryserialize_main(ifp))
640 continue;
641
642 if ((ifp->if_flags & (IFF_UP|IFF_RUNNING|IFF_POLLING))
643 == (IFF_UP|IFF_RUNNING|IFF_POLLING)) {
644 logpoll(beg, ifp);
645 crit_enter();
646 ifp->if_poll(ifp, arg, cycles);
647 crit_exit();
648 logpoll(end, ifp);
649 }
650
651 ifnet_deserialize_main(ifp);
652 }
653
654 schedpollmore(pctx);
655 pctx->phase = 4;
656}
657
658static void
659poll_register(netmsg_t msg)
660{
661 struct ifnet *ifp = msg->lmsg.u.ms_resultp;
662 struct pollctx *pctx;
663 int rc, cpuid;
664
665 cpuid = mycpu->gd_cpuid;
666 KKASSERT(cpuid < POLLCTX_MAX);
667
668 pctx = poll_context[cpuid];
669 KKASSERT(pctx != NULL);
670 KKASSERT(pctx->poll_cpuid == cpuid);
671
672 if (pctx->polling_enabled == 0) {
673 /* Polling disabled, cannot register */
674 rc = EOPNOTSUPP;
675 goto back;
676 }
677
678 /*
679 * Check if there is room.
680 */
681 if (pctx->poll_handlers >= POLL_LIST_LEN) {
682 /*
683 * List full, cannot register more entries.
684 * This should never happen; if it does, it is probably a
685 * broken driver trying to register multiple times. Checking
686 * this at runtime is expensive, and won't solve the problem
687 * anyways, so just report a few times and then give up.
688 */
689 static int verbose = 10; /* XXX */
690 if (verbose >0) {
691 kprintf("poll handlers list full, "
692 "maybe a broken driver ?\n");
693 verbose--;
694 }
695 rc = ENOMEM;
696 } else {
697 pctx->pr[pctx->poll_handlers].ifp = ifp;
698 pctx->poll_handlers++;
699 rc = 0;
700
701 if (pctx->poll_handlers == 1) {
702 KKASSERT(pctx->polling_enabled);
703 systimer_adjust_periodic(&pctx->pollclock,
704 pctx->pollhz);
705 }
706 }
707back:
708 lwkt_replymsg(&msg->lmsg, rc);
709}
710
711/*
712 * Try to register routine for polling. Returns 1 if successful
713 * (and polling should be enabled), 0 otherwise.
714 *
715 * Called from mainline code only, not called from an interrupt.
716 */
717int
718ether_poll_register(struct ifnet *ifp)
719{
720 if (poll_defcpu < 0)
721 return 0;
722 KKASSERT(poll_defcpu < POLLCTX_MAX);
723
724 return ether_pollcpu_register(ifp, poll_defcpu);
725}
726
727int
728ether_pollcpu_register(struct ifnet *ifp, int cpuid)
729{
730 struct netmsg_base msg;
731 lwkt_port_t port;
732 int rc;
733
734 if (ifp->if_poll == NULL) {
735 /* Device does not support polling */
736 return 0;
737 }
738
739 if (cpuid < 0 || cpuid >= POLLCTX_MAX)
740 return 0;
741
742 if ((CPUMASK(cpuid) & poll_cpumask) == 0) {
743 /* Polling is not supported on 'cpuid' */
744 return 0;
745 }
746 KKASSERT(poll_context[cpuid] != NULL);
747
748 /*
749 * Attempt to register. Interlock with IFF_POLLING.
750 */
751 crit_enter(); /* XXX MP - not mp safe */
752
753 ifnet_serialize_all(ifp);
754 if (ifp->if_flags & IFF_POLLING) {
755 /* Already polling */
756 KKASSERT(ifp->if_poll_cpuid >= 0);
757 ifnet_deserialize_all(ifp);
758 crit_exit();
759 return 0;
760 }
761 KKASSERT(ifp->if_poll_cpuid < 0);
762 ifp->if_flags |= IFF_POLLING;
763 ifp->if_poll_cpuid = cpuid;
764 if (ifp->if_flags & IFF_RUNNING)
765 ifp->if_poll(ifp, POLL_REGISTER, 0);
766 ifnet_deserialize_all(ifp);
767
768 netmsg_init(&msg, NULL, &curthread->td_msgport,
769 0, poll_register);
770 msg.lmsg.u.ms_resultp = ifp;
771
772 port = cpu_portfn(cpuid);
773 lwkt_domsg(port, &msg.lmsg, 0);
774
775 if (msg.lmsg.ms_error) {
776 ifnet_serialize_all(ifp);
777 ifp->if_flags &= ~IFF_POLLING;
778 ifp->if_poll_cpuid = -1;
779 if (ifp->if_flags & IFF_RUNNING)
780 ifp->if_poll(ifp, POLL_DEREGISTER, 0);
781 ifnet_deserialize_all(ifp);
782 rc = 0;
783 } else {
784 rc = 1;
785 }
786
787 crit_exit();
788 return rc;
789}
790
791static void
792poll_deregister(netmsg_t msg)
793{
794 struct ifnet *ifp = msg->lmsg.u.ms_resultp;
795 struct pollctx *pctx;
796 int rc, i, cpuid;
797
798 cpuid = mycpu->gd_cpuid;
799 KKASSERT(cpuid < POLLCTX_MAX);
800
801 pctx = poll_context[cpuid];
802 KKASSERT(pctx != NULL);
803 KKASSERT(pctx->poll_cpuid == cpuid);
804
805 for (i = 0 ; i < pctx->poll_handlers ; i++) {
806 if (pctx->pr[i].ifp == ifp) /* Found it */
807 break;
808 }
809 if (i == pctx->poll_handlers) {
810 kprintf("ether_poll_deregister: ifp not found!!!\n");
811 rc = ENOENT;
812 } else {
813 pctx->poll_handlers--;
814 if (i < pctx->poll_handlers) {
815 /* Last entry replaces this one. */
816 pctx->pr[i].ifp = pctx->pr[pctx->poll_handlers].ifp;
817 }
818
819 if (pctx->poll_handlers == 0) {
820 systimer_adjust_periodic(&pctx->pollclock, 1);
821 poll_reset_state(pctx);
822 }
823 rc = 0;
824 }
825 lwkt_replymsg(&msg->lmsg, rc);
826}
827
828/*
829 * Remove interface from the polling list. Occurs when polling is turned
830 * off. Called from mainline code only, not called from an interrupt.
831 */
832int
833ether_poll_deregister(struct ifnet *ifp)
834{
835 struct netmsg_base msg;
836 lwkt_port_t port;
837 int rc, cpuid;
838
839 KKASSERT(ifp != NULL);
840
841 if (ifp->if_poll == NULL)
842 return 0;
843
844 crit_enter();
845
846 ifnet_serialize_all(ifp);
847 if ((ifp->if_flags & IFF_POLLING) == 0) {
848 KKASSERT(ifp->if_poll_cpuid < 0);
849 ifnet_deserialize_all(ifp);
850 crit_exit();
851 return 0;
852 }
853
854 cpuid = ifp->if_poll_cpuid;
855 KKASSERT(cpuid >= 0);
856 KKASSERT(poll_context[cpuid] != NULL);
857
858 ifp->if_flags &= ~IFF_POLLING;
859 ifp->if_poll_cpuid = -1;
860 ifnet_deserialize_all(ifp);
861
862 netmsg_init(&msg, NULL, &curthread->td_msgport,
863 0, poll_deregister);
864 msg.lmsg.u.ms_resultp = ifp;
865
866 port = cpu_portfn(cpuid);
867 lwkt_domsg(port, &msg.lmsg, 0);
868
869 if (!msg.lmsg.ms_error) {
870 ifnet_serialize_all(ifp);
871 if (ifp->if_flags & IFF_RUNNING)
872 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
873 ifnet_deserialize_all(ifp);
874 rc = 1;
875 } else {
876 rc = 0;
877 }
878
879 crit_exit();
880 return rc;
881}
882
883static void
884poll_add_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent,
885 struct pollctx *pctx)
886{
887 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "enable",
888 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_polling,
889 "I", "Polling enabled");
890
891 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "pollhz",
892 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_pollhz,
893 "I", "Device polling frequency");
894
895 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "reg_frac",
896 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_regfrac,
897 "IU", "Every this many cycles poll register");
898
899 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "burst_max",
900 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_burstmax,
901 "IU", "Max Polling burst size");
902
903 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "each_burst",
904 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_eachburst,
905 "IU", "Max size of each burst");
906
907 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "phase", CTLFLAG_RD,
908 &pctx->phase, 0, "Polling phase");
909
910 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "suspect", CTLFLAG_RW,
911 &pctx->suspect, 0, "suspect event");
912
913 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "stalled", CTLFLAG_RW,
914 &pctx->stalled, 0, "potential stalls");
915
916 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "burst", CTLFLAG_RD,
917 &pctx->poll_burst, 0, "Current polling burst size");
918
919 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "user_frac", CTLFLAG_RW,
920 &pctx->user_frac, 0,
921 "Desired user fraction of cpu time");
922
923 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "short_ticks", CTLFLAG_RW,
924 &pctx->short_ticks, 0,
925 "Hardclock ticks shorter than they should be");
926
927 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "lost_polls", CTLFLAG_RW,
928 &pctx->lost_polls, 0,
929 "How many times we would have lost a poll tick");
930
931 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "pending_polls", CTLFLAG_RD,
932 &pctx->pending_polls, 0, "Do we need to poll again");
933
934 SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "residual_burst", CTLFLAG_RD,
935 &pctx->residual_burst, 0,
936 "# of residual cycles in burst");
937
938 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "handlers", CTLFLAG_RD,
939 &pctx->poll_handlers, 0,
940 "Number of registered poll handlers");
941}
942
943static void
944poll_sysctl_pollhz(netmsg_t msg)
945{
946 struct pollctx *pctx;
947 int cpuid;
948
949 cpuid = mycpu->gd_cpuid;
950 KKASSERT(cpuid < POLLCTX_MAX);
951
952 pctx = poll_context[cpuid];
953 KKASSERT(pctx != NULL);
954 KKASSERT(pctx->poll_cpuid == cpuid);
955
956 /*
957 * If polling is disabled or there is no device registered,
958 * don't adjust polling systimer frequency.
959 * Polling systimer frequency will be adjusted once polling
960 * is enabled and there are registered devices.
961 */
962 pctx->pollhz = msg->lmsg.u.ms_result;
963 if (pctx->polling_enabled && pctx->poll_handlers)
964 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
965
966 /*
967 * Make sure that reg_frac and reg_frac_count are within valid range.
968 */
969 if (pctx->reg_frac > pctx->pollhz) {
970 pctx->reg_frac = pctx->pollhz;
971 if (pctx->reg_frac_count > pctx->reg_frac)
972 pctx->reg_frac_count = pctx->reg_frac - 1;
973 }
974
975 lwkt_replymsg(&msg->lmsg, 0);
976}
977
978static void
979poll_sysctl_polling(netmsg_t msg)
980{
981 struct pollctx *pctx;
982 int cpuid;
983
984 cpuid = mycpu->gd_cpuid;
985 KKASSERT(cpuid < POLLCTX_MAX);
986
987 pctx = poll_context[cpuid];
988 KKASSERT(pctx != NULL);
989 KKASSERT(pctx->poll_cpuid == cpuid);
990
991 /*
992 * If polling is disabled or there is no device registered,
993 * cut the polling systimer frequency to 1hz.
994 */
995 pctx->polling_enabled = msg->lmsg.u.ms_result;
996 if (pctx->polling_enabled && pctx->poll_handlers) {
997 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
998 } else {
999 systimer_adjust_periodic(&pctx->pollclock, 1);
1000 poll_reset_state(pctx);
1001 }
1002
1003 if (!pctx->polling_enabled && pctx->poll_handlers != 0) {
1004 int i;
1005
1006 for (i = 0 ; i < pctx->poll_handlers ; i++) {
1007 struct ifnet *ifp = pctx->pr[i].ifp;
1008
1009 ifnet_serialize_all(ifp);
1010
1011 if ((ifp->if_flags & IFF_POLLING) == 0) {
1012 KKASSERT(ifp->if_poll_cpuid < 0);
1013 ifnet_deserialize_all(ifp);
1014 continue;
1015 }
1016 ifp->if_flags &= ~IFF_POLLING;
1017 ifp->if_poll_cpuid = -1;
1018
1019 /*
1020 * Only call the interface deregistration
1021 * function if the interface is still
1022 * running.
1023 */
1024 if (ifp->if_flags & IFF_RUNNING)
1025 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
1026
1027 ifnet_deserialize_all(ifp);
1028 }
1029 pctx->poll_handlers = 0;
1030 }
1031
1032 lwkt_replymsg(&msg->lmsg, 0);
1033}
1034
1035static void
1036poll_sysctl_regfrac(netmsg_t msg)
1037{
1038 struct pollctx *pctx;
1039 uint32_t reg_frac;
1040 int cpuid;
1041
1042 cpuid = mycpu->gd_cpuid;
1043 KKASSERT(cpuid < POLLCTX_MAX);
1044
1045 pctx = poll_context[cpuid];
1046 KKASSERT(pctx != NULL);
1047 KKASSERT(pctx->poll_cpuid == cpuid);
1048
1049 reg_frac = msg->lmsg.u.ms_result;
1050 if (reg_frac > pctx->pollhz)
1051 reg_frac = pctx->pollhz;
1052 else if (reg_frac < 1)
1053 reg_frac = 1;
1054
1055 pctx->reg_frac = reg_frac;
1056 if (pctx->reg_frac_count > pctx->reg_frac)
1057 pctx->reg_frac_count = pctx->reg_frac - 1;
1058
1059 lwkt_replymsg(&msg->lmsg, 0);
1060}
1061
1062static void
1063poll_sysctl_burstmax(netmsg_t msg)
1064{
1065 struct pollctx *pctx;
1066 int cpuid;
1067
1068 cpuid = mycpu->gd_cpuid;
1069 KKASSERT(cpuid < POLLCTX_MAX);
1070
1071 pctx = poll_context[cpuid];
1072 KKASSERT(pctx != NULL);
1073 KKASSERT(pctx->poll_cpuid == cpuid);
1074
1075 pctx->poll_burst_max = msg->lmsg.u.ms_result;
1076 if (pctx->poll_each_burst > pctx->poll_burst_max)
1077 pctx->poll_each_burst = pctx->poll_burst_max;
1078 if (pctx->poll_burst > pctx->poll_burst_max)
1079 pctx->poll_burst = pctx->poll_burst_max;
1080 if (pctx->residual_burst > pctx->poll_burst_max)
1081 pctx->residual_burst = pctx->poll_burst_max;
1082
1083 lwkt_replymsg(&msg->lmsg, 0);
1084}
1085
1086static void
1087poll_sysctl_eachburst(netmsg_t msg)
1088{
1089 struct pollctx *pctx;
1090 uint32_t each_burst;
1091 int cpuid;
1092
1093 cpuid = mycpu->gd_cpuid;
1094 KKASSERT(cpuid < POLLCTX_MAX);
1095
1096 pctx = poll_context[cpuid];
1097 KKASSERT(pctx != NULL);
1098 KKASSERT(pctx->poll_cpuid == cpuid);
1099
1100 each_burst = msg->lmsg.u.ms_result;
1101 if (each_burst > pctx->poll_burst_max)
1102 each_burst = pctx->poll_burst_max;
1103 else if (each_burst < 1)
1104 each_burst = 1;
1105 pctx->poll_each_burst = each_burst;
1106
1107 lwkt_replymsg(&msg->lmsg, 0);
1108}