Mark all msgs used by polling(4) as MPSAFE ones
[dragonfly.git] / sys / kern / kern_poll.c
... / ...
CommitLineData
1/*-
2 * Copyright (c) 2001-2002 Luigi Rizzo
3 *
4 * Supported by: the Xorp Project (www.xorp.org)
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: src/sys/kern/kern_poll.c,v 1.2.2.4 2002/06/27 23:26:33 luigi Exp $
28 * $DragonFly: src/sys/kern/kern_poll.c,v 1.48 2008/09/24 12:07:19 sephe Exp $
29 */
30
31#include "opt_polling.h"
32
33#include <sys/param.h>
34#include <sys/kernel.h>
35#include <sys/ktr.h>
36#include <sys/socket.h> /* needed by net/if.h */
37#include <sys/sysctl.h>
38
39#include <sys/thread2.h>
40#include <sys/msgport2.h>
41
42#include <net/if.h> /* for IFF_* flags */
43#include <net/netmsg2.h>
44
45/*
46 * Polling support for [network] device drivers.
47 *
48 * Drivers which support this feature try to register with the
49 * polling code.
50 *
51 * If registration is successful, the driver must disable interrupts,
52 * and further I/O is performed through the handler, which is invoked
53 * (at least once per clock tick) with 3 arguments: the "arg" passed at
54 * register time (a struct ifnet pointer), a command, and a "count" limit.
55 *
56 * The command can be one of the following:
57 * POLL_ONLY: quick move of "count" packets from input/output queues.
58 * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
59 * other more expensive operations. This command is issued periodically
60 * but less frequently than POLL_ONLY.
61 * POLL_DEREGISTER: deregister and return to interrupt mode.
62 * POLL_REGISTER: register and disable interrupts
63 *
64 * The first two commands are only issued if the interface is marked as
65 * 'IFF_UP, IFF_RUNNING and IFF_POLLING', the last two only if IFF_RUNNING
66 * is set.
67 *
68 * The count limit specifies how much work the handler can do during the
69 * call -- typically this is the number of packets to be received, or
70 * transmitted, etc. (drivers are free to interpret this number, as long
71 * as the max time spent in the function grows roughly linearly with the
72 * count).
73 *
74 * Deregistration can be requested by the driver itself (typically in the
75 * *_stop() routine), or by the polling code, by invoking the handler.
76 *
77 * Polling can be enabled or disabled on particular CPU_X with the sysctl
78 * variable kern.polling.X.enable (default is 1, enabled)
79 *
80 * A second variable controls the sharing of CPU between polling/kernel
81 * network processing, and other activities (typically userlevel tasks):
82 * kern.polling.X.user_frac (between 0 and 100, default 50) sets the share
83 * of CPU allocated to user tasks. CPU is allocated proportionally to the
84 * shares, by dynamically adjusting the "count" (poll_burst).
85 *
86 * Other parameters should be left to their default values.
87 * The following constraints hold
88 *
89 * 1 <= poll_burst <= poll_burst_max
90 * 1 <= poll_each_burst <= poll_burst_max
91 * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
92 */
93
/*
 * Limits and defaults for the burst tunables.
 */
#define MIN_POLL_BURST_MAX		10
#define MAX_POLL_BURST_MAX		1000
#define POLL_BURST_MAX			150	/* good for 100Mbit net and HZ=1000 */
#define POLL_EACH_BURST			5

/*
 * Limits and defaults for the polling frequency.  The upper limit is
 * only defined here when nothing defined it earlier.
 */
#ifndef DEVICE_POLLING_FREQ_MAX
#define DEVICE_POLLING_FREQ_MAX		30000
#endif
#define DEVICE_POLLING_FREQ_DEFAULT	2000
103
#define POLL_LIST_LEN	128	/* number of slots in pollctx.pr[] */

/*
 * One slot of the per-cpu handler list: the interface to be polled.
 */
struct pollrec {
	struct ifnet	*ifp;
};
108
109#define POLLCTX_MAX 32
110
111struct pollctx {
112 struct sysctl_ctx_list poll_sysctl_ctx;
113 struct sysctl_oid *poll_sysctl_tree;
114
115 uint32_t poll_burst; /* state */
116 uint32_t poll_each_burst; /* tunable */
117 uint32_t poll_burst_max; /* tunable */
118 uint32_t user_frac; /* tunable */
119 int reg_frac_count; /* state */
120 uint32_t reg_frac; /* tunable */
121 uint32_t short_ticks; /* statistics */
122 uint32_t lost_polls; /* statistics */
123 uint32_t pending_polls; /* state */
124 int residual_burst; /* state */
125 uint32_t phase; /* state */
126 uint32_t suspect; /* statistics */
127 uint32_t stalled; /* statistics */
128 struct timeval poll_start_t; /* state */
129 struct timeval prev_t; /* state */
130
131 uint32_t poll_handlers; /* next free entry in pr[]. */
132 struct pollrec pr[POLL_LIST_LEN];
133
134 int poll_cpuid;
135 struct systimer pollclock;
136 int polling_enabled; /* tunable */
137 int pollhz; /* tunable */
138
139 struct netmsg poll_netmsg;
140 struct netmsg poll_more_netmsg;
141};
142
143static struct pollctx *poll_context[POLLCTX_MAX];
144
145SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
146 "Device polling parameters");
147
148static int poll_defcpu = -1;
149SYSCTL_INT(_kern_polling, OID_AUTO, defcpu, CTLFLAG_RD,
150 &poll_defcpu, 0, "default CPU to run device polling");
151
152static uint32_t poll_cpumask0 = 0xffffffff;
153TUNABLE_INT("kern.polling.cpumask", (int *)&poll_cpumask0);
154
155static uint32_t poll_cpumask;
156SYSCTL_INT(_kern_polling, OID_AUTO, cpumask, CTLFLAG_RD,
157 &poll_cpumask, 0, "CPUs that can run device polling");
158
159static int polling_enabled = 1; /* global polling enable */
160TUNABLE_INT("kern.polling.enable", &polling_enabled);
161
162static int pollhz = DEVICE_POLLING_FREQ_DEFAULT;
163TUNABLE_INT("kern.polling.pollhz", &pollhz);
164
165static int poll_burst_max = POLL_BURST_MAX;
166TUNABLE_INT("kern.polling.burst_max", &poll_burst_max);
167
168static int poll_each_burst = POLL_EACH_BURST;
169TUNABLE_INT("kern.polling.each_burst", &poll_each_burst);
170
171/* Netisr handlers */
172static void netisr_poll(struct netmsg *);
173static void netisr_pollmore(struct netmsg *);
174static void poll_register(struct netmsg *);
175static void poll_deregister(struct netmsg *);
176static void poll_sysctl_pollhz(struct netmsg *);
177static void poll_sysctl_polling(struct netmsg *);
178static void poll_sysctl_regfrac(struct netmsg *);
179static void poll_sysctl_burstmax(struct netmsg *);
180static void poll_sysctl_eachburst(struct netmsg *);
181
182/* Systimer handler */
183static void pollclock(systimer_t, struct intrframe *);
184
185/* Sysctl handlers */
186static int sysctl_pollhz(SYSCTL_HANDLER_ARGS);
187static int sysctl_polling(SYSCTL_HANDLER_ARGS);
188static int sysctl_regfrac(SYSCTL_HANDLER_ARGS);
189static int sysctl_burstmax(SYSCTL_HANDLER_ARGS);
190static int sysctl_eachburst(SYSCTL_HANDLER_ARGS);
191static void poll_add_sysctl(struct sysctl_ctx_list *,
192 struct sysctl_oid_list *, struct pollctx *);
193
194static void schedpoll_oncpu(struct netmsg *);
195
196void init_device_poll_pcpu(int); /* per-cpu init routine */
197
198#define POLL_KTR_STRING "ifp=%p"
199#define POLL_KTR_ARG_SIZE (sizeof(void *))
200
201#ifndef KTR_POLLING
202#define KTR_POLLING KTR_ALL
203#endif
204KTR_INFO_MASTER(poll);
205KTR_INFO(KTR_POLLING, poll, beg, 0, POLL_KTR_STRING, POLL_KTR_ARG_SIZE);
206KTR_INFO(KTR_POLLING, poll, end, 1, POLL_KTR_STRING, POLL_KTR_ARG_SIZE);
207
208#define logpoll(name, arg) KTR_LOG(poll_ ## name, arg)
209
210static __inline void
211poll_reset_state(struct pollctx *pctx)
212{
213 crit_enter();
214 pctx->poll_burst = 5;
215 pctx->reg_frac_count = 0;
216 pctx->pending_polls = 0;
217 pctx->residual_burst = 0;
218 pctx->phase = 0;
219 bzero(&pctx->poll_start_t, sizeof(pctx->poll_start_t));
220 bzero(&pctx->prev_t, sizeof(pctx->prev_t));
221 crit_exit();
222}
223
224/*
225 * Initialize per-cpu polling(4) context. Called from kern_clock.c:
226 */
227void
228init_device_poll_pcpu(int cpuid)
229{
230 struct pollctx *pctx;
231 char cpuid_str[3];
232
233 if (cpuid >= POLLCTX_MAX)
234 return;
235
236 if (((1 << cpuid) & poll_cpumask0) == 0)
237 return;
238
239 if (poll_burst_max < MIN_POLL_BURST_MAX)
240 poll_burst_max = MIN_POLL_BURST_MAX;
241 else if (poll_burst_max > MAX_POLL_BURST_MAX)
242 poll_burst_max = MAX_POLL_BURST_MAX;
243
244 if (poll_each_burst > poll_burst_max)
245 poll_each_burst = poll_burst_max;
246
247 poll_cpumask |= (1 << cpuid);
248
249 pctx = kmalloc(sizeof(*pctx), M_DEVBUF, M_WAITOK | M_ZERO);
250
251 pctx->poll_each_burst = poll_each_burst;
252 pctx->poll_burst_max = poll_burst_max;
253 pctx->user_frac = 50;
254 pctx->reg_frac = 20;
255 pctx->polling_enabled = polling_enabled;
256 pctx->pollhz = pollhz;
257 pctx->poll_cpuid = cpuid;
258 poll_reset_state(pctx);
259
260 netmsg_init(&pctx->poll_netmsg, &netisr_adone_rport, MSGF_MPSAFE,
261 netisr_poll);
262#ifdef INVARIANTS
263 pctx->poll_netmsg.nm_lmsg.u.ms_resultp = pctx;
264#endif
265
266 netmsg_init(&pctx->poll_more_netmsg, &netisr_adone_rport, MSGF_MPSAFE,
267 netisr_pollmore);
268#ifdef INVARIANTS
269 pctx->poll_more_netmsg.nm_lmsg.u.ms_resultp = pctx;
270#endif
271
272 KASSERT(cpuid < POLLCTX_MAX, ("cpu id must < %d", cpuid));
273 poll_context[cpuid] = pctx;
274
275 if (poll_defcpu < 0) {
276 poll_defcpu = cpuid;
277
278 /*
279 * Initialize global sysctl nodes, for compat
280 */
281 poll_add_sysctl(NULL, SYSCTL_STATIC_CHILDREN(_kern_polling),
282 pctx);
283 }
284
285 /*
286 * Initialize per-cpu sysctl nodes
287 */
288 ksnprintf(cpuid_str, sizeof(cpuid_str), "%d", pctx->poll_cpuid);
289
290 sysctl_ctx_init(&pctx->poll_sysctl_ctx);
291 pctx->poll_sysctl_tree = SYSCTL_ADD_NODE(&pctx->poll_sysctl_ctx,
292 SYSCTL_STATIC_CHILDREN(_kern_polling),
293 OID_AUTO, cpuid_str, CTLFLAG_RD, 0, "");
294 poll_add_sysctl(&pctx->poll_sysctl_ctx,
295 SYSCTL_CHILDREN(pctx->poll_sysctl_tree), pctx);
296
297 /*
298 * Initialize systimer
299 */
300 systimer_init_periodic_nq(&pctx->pollclock, pollclock, pctx, 1);
301}
302
303static void
304schedpoll_oncpu(struct netmsg *msg)
305{
306 if (msg->nm_lmsg.ms_flags & MSGF_DONE)
307 lwkt_sendmsg(cpu_portfn(mycpuid), &msg->nm_lmsg);
308}
309
310static __inline void
311schedpoll(struct pollctx *pctx)
312{
313 crit_enter();
314 schedpoll_oncpu(&pctx->poll_netmsg);
315 crit_exit();
316}
317
318static __inline void
319schedpollmore(struct pollctx *pctx)
320{
321 schedpoll_oncpu(&pctx->poll_more_netmsg);
322}
323
324/*
325 * Set the polling frequency
326 */
327static int
328sysctl_pollhz(SYSCTL_HANDLER_ARGS)
329{
330 struct pollctx *pctx = arg1;
331 struct netmsg msg;
332 lwkt_port_t port;
333 int error, phz;
334
335 phz = pctx->pollhz;
336 error = sysctl_handle_int(oidp, &phz, 0, req);
337 if (error || req->newptr == NULL)
338 return error;
339 if (phz <= 0)
340 return EINVAL;
341 else if (phz > DEVICE_POLLING_FREQ_MAX)
342 phz = DEVICE_POLLING_FREQ_MAX;
343
344 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE,
345 poll_sysctl_pollhz);
346 msg.nm_lmsg.u.ms_result = phz;
347
348 port = cpu_portfn(pctx->poll_cpuid);
349 lwkt_domsg(port, &msg.nm_lmsg, 0);
350 return 0;
351}
352
353/*
354 * Master enable.
355 */
356static int
357sysctl_polling(SYSCTL_HANDLER_ARGS)
358{
359 struct pollctx *pctx = arg1;
360 struct netmsg msg;
361 lwkt_port_t port;
362 int error, enabled;
363
364 enabled = pctx->polling_enabled;
365 error = sysctl_handle_int(oidp, &enabled, 0, req);
366 if (error || req->newptr == NULL)
367 return error;
368
369 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE,
370 poll_sysctl_polling);
371 msg.nm_lmsg.u.ms_result = enabled;
372
373 port = cpu_portfn(pctx->poll_cpuid);
374 lwkt_domsg(port, &msg.nm_lmsg, 0);
375 return 0;
376}
377
378static int
379sysctl_regfrac(SYSCTL_HANDLER_ARGS)
380{
381 struct pollctx *pctx = arg1;
382 struct netmsg msg;
383 lwkt_port_t port;
384 uint32_t reg_frac;
385 int error;
386
387 reg_frac = pctx->reg_frac;
388 error = sysctl_handle_int(oidp, &reg_frac, 0, req);
389 if (error || req->newptr == NULL)
390 return error;
391
392 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE,
393 poll_sysctl_regfrac);
394 msg.nm_lmsg.u.ms_result = reg_frac;
395
396 port = cpu_portfn(pctx->poll_cpuid);
397 lwkt_domsg(port, &msg.nm_lmsg, 0);
398 return 0;
399}
400
401static int
402sysctl_burstmax(SYSCTL_HANDLER_ARGS)
403{
404 struct pollctx *pctx = arg1;
405 struct netmsg msg;
406 lwkt_port_t port;
407 uint32_t burst_max;
408 int error;
409
410 burst_max = pctx->poll_burst_max;
411 error = sysctl_handle_int(oidp, &burst_max, 0, req);
412 if (error || req->newptr == NULL)
413 return error;
414 if (burst_max < MIN_POLL_BURST_MAX)
415 burst_max = MIN_POLL_BURST_MAX;
416 else if (burst_max > MAX_POLL_BURST_MAX)
417 burst_max = MAX_POLL_BURST_MAX;
418
419 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE,
420 poll_sysctl_burstmax);
421 msg.nm_lmsg.u.ms_result = burst_max;
422
423 port = cpu_portfn(pctx->poll_cpuid);
424 lwkt_domsg(port, &msg.nm_lmsg, 0);
425 return 0;
426}
427
428static int
429sysctl_eachburst(SYSCTL_HANDLER_ARGS)
430{
431 struct pollctx *pctx = arg1;
432 struct netmsg msg;
433 lwkt_port_t port;
434 uint32_t each_burst;
435 int error;
436
437 each_burst = pctx->poll_each_burst;
438 error = sysctl_handle_int(oidp, &each_burst, 0, req);
439 if (error || req->newptr == NULL)
440 return error;
441
442 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE,
443 poll_sysctl_eachburst);
444 msg.nm_lmsg.u.ms_result = each_burst;
445
446 port = cpu_portfn(pctx->poll_cpuid);
447 lwkt_domsg(port, &msg.nm_lmsg, 0);
448 return 0;
449}
450
451/*
452 * Hook from polling systimer. Tries to schedule a netisr, but keeps
453 * track of lost ticks due to the previous handler taking too long.
454 * Normally, this should not happen, because polling handler should
455 * run for a short time. However, in some cases (e.g. when there are
456 * changes in link status etc.) the drivers take a very long time
457 * (even in the order of milliseconds) to reset and reconfigure the
458 * device, causing apparent lost polls.
459 *
460 * The first part of the code is just for debugging purposes, and tries
461 * to count how often hardclock ticks are shorter than they should,
462 * meaning either stray interrupts or delayed events.
463 *
464 * WARNING! called from fastint or IPI, the MP lock might not be held.
465 */
466static void
467pollclock(systimer_t info, struct intrframe *frame __unused)
468{
469 struct pollctx *pctx = info->data;
470 struct timeval t;
471 int delta;
472
473 if (pctx->poll_handlers == 0)
474 return;
475
476 microuptime(&t);
477 delta = (t.tv_usec - pctx->prev_t.tv_usec) +
478 (t.tv_sec - pctx->prev_t.tv_sec)*1000000;
479 if (delta * pctx->pollhz < 500000)
480 pctx->short_ticks++;
481 else
482 pctx->prev_t = t;
483
484 if (pctx->pending_polls > 100) {
485 /*
486 * Too much, assume it has stalled (not always true
487 * see comment above).
488 */
489 pctx->stalled++;
490 pctx->pending_polls = 0;
491 pctx->phase = 0;
492 }
493
494 if (pctx->phase <= 2) {
495 if (pctx->phase != 0)
496 pctx->suspect++;
497 pctx->phase = 1;
498 schedpoll(pctx);
499 pctx->phase = 2;
500 }
501 if (pctx->pending_polls++ > 0)
502 pctx->lost_polls++;
503}
504
505/*
506 * netisr_pollmore is called after other netisr's, possibly scheduling
507 * another NETISR_POLL call, or adapting the burst size for the next cycle.
508 *
509 * It is very bad to fetch large bursts of packets from a single card at once,
510 * because the burst could take a long time to be completely processed leading
511 * to unfairness. To reduce the problem, and also to account better for time
512 * spent in network-related processing, we split the burst in smaller chunks
513 * of fixed size, giving control to the other netisr's between chunks. This
514 * helps in improving the fairness, reducing livelock (because we emulate more
515 * closely the "process to completion" that we have with fastforwarding) and
516 * accounting for the work performed in low level handling and forwarding.
517 */
518
519/* ARGSUSED */
520static void
521netisr_pollmore(struct netmsg *msg)
522{
523 struct pollctx *pctx;
524 struct timeval t;
525 int kern_load, cpuid;
526 uint32_t pending_polls;
527
528 cpuid = mycpu->gd_cpuid;
529 KKASSERT(cpuid < POLLCTX_MAX);
530
531 pctx = poll_context[cpuid];
532 KKASSERT(pctx != NULL);
533 KKASSERT(pctx->poll_cpuid == cpuid);
534 KKASSERT(pctx == msg->nm_lmsg.u.ms_resultp);
535
536 lwkt_replymsg(&msg->nm_lmsg, 0);
537
538 if (pctx->poll_handlers == 0)
539 return;
540
541 KASSERT(pctx->polling_enabled,
542 ("# of registered poll handlers are not zero, "
543 "but polling is not enabled\n"));
544
545 pctx->phase = 5;
546 if (pctx->residual_burst > 0) {
547 schedpoll(pctx);
548 /* will run immediately on return, followed by netisrs */
549 return;
550 }
551 /* here we can account time spent in netisr's in this tick */
552 microuptime(&t);
553 kern_load = (t.tv_usec - pctx->poll_start_t.tv_usec) +
554 (t.tv_sec - pctx->poll_start_t.tv_sec)*1000000; /* us */
555 kern_load = (kern_load * pctx->pollhz) / 10000; /* 0..100 */
556 if (kern_load > (100 - pctx->user_frac)) { /* try decrease ticks */
557 if (pctx->poll_burst > 1)
558 pctx->poll_burst--;
559 } else {
560 if (pctx->poll_burst < pctx->poll_burst_max)
561 pctx->poll_burst++;
562 }
563
564 crit_enter();
565 pctx->pending_polls--;
566 pending_polls = pctx->pending_polls;
567 crit_exit();
568
569 if (pending_polls == 0) { /* we are done */
570 pctx->phase = 0;
571 } else {
572 /*
573 * Last cycle was long and caused us to miss one or more
574 * hardclock ticks. Restart processing again, but slightly
575 * reduce the burst size to prevent that this happens again.
576 */
577 pctx->poll_burst -= (pctx->poll_burst / 8);
578 if (pctx->poll_burst < 1)
579 pctx->poll_burst = 1;
580 schedpoll(pctx);
581 pctx->phase = 6;
582 }
583}
584
585/*
586 * netisr_poll is scheduled by schedpoll when appropriate, typically once
587 * per polling systimer tick.
588 *
589 * Note that the message is replied immediately in order to allow a new
590 * ISR to be scheduled in the handler.
591 *
592 * XXX each registration should indicate whether it needs a critical
593 * section to operate.
594 */
595/* ARGSUSED */
596static void
597netisr_poll(struct netmsg *msg)
598{
599 struct pollctx *pctx;
600 int i, cycles, cpuid;
601 enum poll_cmd arg = POLL_ONLY;
602
603 cpuid = mycpu->gd_cpuid;
604 KKASSERT(cpuid < POLLCTX_MAX);
605
606 pctx = poll_context[cpuid];
607 KKASSERT(pctx != NULL);
608 KKASSERT(pctx->poll_cpuid == cpuid);
609 KKASSERT(pctx == msg->nm_lmsg.u.ms_resultp);
610
611 crit_enter();
612 lwkt_replymsg(&msg->nm_lmsg, 0);
613 crit_exit();
614
615 if (pctx->poll_handlers == 0)
616 return;
617
618 KASSERT(pctx->polling_enabled,
619 ("# of registered poll handlers are not zero, "
620 "but polling is not enabled\n"));
621
622 pctx->phase = 3;
623 if (pctx->residual_burst == 0) { /* first call in this tick */
624 microuptime(&pctx->poll_start_t);
625
626 if (pctx->reg_frac_count-- == 0) {
627 arg = POLL_AND_CHECK_STATUS;
628 pctx->reg_frac_count = pctx->reg_frac - 1;
629 }
630
631 pctx->residual_burst = pctx->poll_burst;
632 }
633 cycles = (pctx->residual_burst < pctx->poll_each_burst) ?
634 pctx->residual_burst : pctx->poll_each_burst;
635 pctx->residual_burst -= cycles;
636
637 for (i = 0 ; i < pctx->poll_handlers ; i++) {
638 struct ifnet *ifp = pctx->pr[i].ifp;
639
640 if (!lwkt_serialize_try(ifp->if_serializer))
641 continue;
642
643 if ((ifp->if_flags & (IFF_UP|IFF_RUNNING|IFF_POLLING))
644 == (IFF_UP|IFF_RUNNING|IFF_POLLING)) {
645 logpoll(beg, ifp);
646 crit_enter();
647 ifp->if_poll(ifp, arg, cycles);
648 crit_exit();
649 logpoll(end, ifp);
650 }
651
652 lwkt_serialize_exit(ifp->if_serializer);
653 }
654
655 schedpollmore(pctx);
656 pctx->phase = 4;
657}
658
659static void
660poll_register(struct netmsg *msg)
661{
662 struct ifnet *ifp = msg->nm_lmsg.u.ms_resultp;
663 struct pollctx *pctx;
664 int rc, cpuid;
665
666 cpuid = mycpu->gd_cpuid;
667 KKASSERT(cpuid < POLLCTX_MAX);
668
669 pctx = poll_context[cpuid];
670 KKASSERT(pctx != NULL);
671 KKASSERT(pctx->poll_cpuid == cpuid);
672
673 if (pctx->polling_enabled == 0) {
674 /* Polling disabled, cannot register */
675 rc = EOPNOTSUPP;
676 goto back;
677 }
678
679 /*
680 * Check if there is room.
681 */
682 if (pctx->poll_handlers >= POLL_LIST_LEN) {
683 /*
684 * List full, cannot register more entries.
685 * This should never happen; if it does, it is probably a
686 * broken driver trying to register multiple times. Checking
687 * this at runtime is expensive, and won't solve the problem
688 * anyways, so just report a few times and then give up.
689 */
690 static int verbose = 10; /* XXX */
691 if (verbose >0) {
692 kprintf("poll handlers list full, "
693 "maybe a broken driver ?\n");
694 verbose--;
695 }
696 rc = ENOMEM;
697 } else {
698 pctx->pr[pctx->poll_handlers].ifp = ifp;
699 pctx->poll_handlers++;
700 rc = 0;
701
702 if (pctx->poll_handlers == 1) {
703 KKASSERT(pctx->polling_enabled);
704 systimer_adjust_periodic(&pctx->pollclock,
705 pctx->pollhz);
706 }
707 }
708back:
709 lwkt_replymsg(&msg->nm_lmsg, rc);
710}
711
712/*
713 * Try to register routine for polling. Returns 1 if successful
714 * (and polling should be enabled), 0 otherwise.
715 *
716 * Called from mainline code only, not called from an interrupt.
717 */
718int
719ether_poll_register(struct ifnet *ifp)
720{
721 if (poll_defcpu < 0)
722 return 0;
723 KKASSERT(poll_defcpu < POLLCTX_MAX);
724
725 return ether_pollcpu_register(ifp, poll_defcpu);
726}
727
728int
729ether_pollcpu_register(struct ifnet *ifp, int cpuid)
730{
731 struct netmsg msg;
732 lwkt_port_t port;
733 int rc;
734
735 if (ifp->if_poll == NULL) {
736 /* Device does not support polling */
737 return 0;
738 }
739
740 if (cpuid < 0 || cpuid >= POLLCTX_MAX)
741 return 0;
742
743 if (((1 << cpuid) & poll_cpumask) == 0) {
744 /* Polling is not supported on 'cpuid' */
745 return 0;
746 }
747 KKASSERT(poll_context[cpuid] != NULL);
748
749 /*
750 * Attempt to register. Interlock with IFF_POLLING.
751 */
752 crit_enter(); /* XXX MP - not mp safe */
753
754 lwkt_serialize_enter(ifp->if_serializer);
755 if (ifp->if_flags & IFF_POLLING) {
756 /* Already polling */
757 KKASSERT(ifp->if_poll_cpuid >= 0);
758 lwkt_serialize_exit(ifp->if_serializer);
759 crit_exit();
760 return 0;
761 }
762 KKASSERT(ifp->if_poll_cpuid < 0);
763 ifp->if_flags |= IFF_POLLING;
764 ifp->if_poll_cpuid = cpuid;
765 if (ifp->if_flags & IFF_RUNNING)
766 ifp->if_poll(ifp, POLL_REGISTER, 0);
767 lwkt_serialize_exit(ifp->if_serializer);
768
769 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE, poll_register);
770 msg.nm_lmsg.u.ms_resultp = ifp;
771
772 port = cpu_portfn(cpuid);
773 lwkt_domsg(port, &msg.nm_lmsg, 0);
774
775 if (msg.nm_lmsg.ms_error) {
776 lwkt_serialize_enter(ifp->if_serializer);
777 ifp->if_flags &= ~IFF_POLLING;
778 ifp->if_poll_cpuid = -1;
779 if (ifp->if_flags & IFF_RUNNING)
780 ifp->if_poll(ifp, POLL_DEREGISTER, 0);
781 lwkt_serialize_exit(ifp->if_serializer);
782 rc = 0;
783 } else {
784 rc = 1;
785 }
786
787 crit_exit();
788 return rc;
789}
790
791static void
792poll_deregister(struct netmsg *msg)
793{
794 struct ifnet *ifp = msg->nm_lmsg.u.ms_resultp;
795 struct pollctx *pctx;
796 int rc, i, cpuid;
797
798 cpuid = mycpu->gd_cpuid;
799 KKASSERT(cpuid < POLLCTX_MAX);
800
801 pctx = poll_context[cpuid];
802 KKASSERT(pctx != NULL);
803 KKASSERT(pctx->poll_cpuid == cpuid);
804
805 for (i = 0 ; i < pctx->poll_handlers ; i++) {
806 if (pctx->pr[i].ifp == ifp) /* Found it */
807 break;
808 }
809 if (i == pctx->poll_handlers) {
810 kprintf("ether_poll_deregister: ifp not found!!!\n");
811 rc = ENOENT;
812 } else {
813 pctx->poll_handlers--;
814 if (i < pctx->poll_handlers) {
815 /* Last entry replaces this one. */
816 pctx->pr[i].ifp = pctx->pr[pctx->poll_handlers].ifp;
817 }
818
819 if (pctx->poll_handlers == 0) {
820 systimer_adjust_periodic(&pctx->pollclock, 1);
821 poll_reset_state(pctx);
822 }
823 rc = 0;
824 }
825 lwkt_replymsg(&msg->nm_lmsg, rc);
826}
827
828/*
829 * Remove interface from the polling list. Occurs when polling is turned
830 * off. Called from mainline code only, not called from an interrupt.
831 */
832int
833ether_poll_deregister(struct ifnet *ifp)
834{
835 struct netmsg msg;
836 lwkt_port_t port;
837 int rc, cpuid;
838
839 KKASSERT(ifp != NULL);
840
841 if (ifp->if_poll == NULL)
842 return 0;
843
844 crit_enter();
845
846 lwkt_serialize_enter(ifp->if_serializer);
847 if ((ifp->if_flags & IFF_POLLING) == 0) {
848 KKASSERT(ifp->if_poll_cpuid < 0);
849 lwkt_serialize_exit(ifp->if_serializer);
850 crit_exit();
851 return 0;
852 }
853
854 cpuid = ifp->if_poll_cpuid;
855 KKASSERT(cpuid >= 0);
856 KKASSERT(poll_context[cpuid] != NULL);
857
858 ifp->if_flags &= ~IFF_POLLING;
859 ifp->if_poll_cpuid = -1;
860 lwkt_serialize_exit(ifp->if_serializer);
861
862 netmsg_init(&msg, &curthread->td_msgport, MSGF_MPSAFE, poll_deregister);
863 msg.nm_lmsg.u.ms_resultp = ifp;
864
865 port = cpu_portfn(cpuid);
866 lwkt_domsg(port, &msg.nm_lmsg, 0);
867
868 if (!msg.nm_lmsg.ms_error) {
869 lwkt_serialize_enter(ifp->if_serializer);
870 if (ifp->if_flags & IFF_RUNNING)
871 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
872 lwkt_serialize_exit(ifp->if_serializer);
873 rc = 1;
874 } else {
875 rc = 0;
876 }
877
878 crit_exit();
879 return rc;
880}
881
882static void
883poll_add_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent,
884 struct pollctx *pctx)
885{
886 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "enable",
887 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_polling,
888 "I", "Polling enabled");
889
890 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "pollhz",
891 CTLTYPE_INT | CTLFLAG_RW, pctx, 0, sysctl_pollhz,
892 "I", "Device polling frequency");
893
894 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "reg_frac",
895 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_regfrac,
896 "IU", "Every this many cycles poll register");
897
898 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "burst_max",
899 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_burstmax,
900 "IU", "Max Polling burst size");
901
902 SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "each_burst",
903 CTLTYPE_UINT | CTLFLAG_RW, pctx, 0, sysctl_eachburst,
904 "IU", "Max size of each burst");
905
906 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "phase", CTLFLAG_RD,
907 &pctx->phase, 0, "Polling phase");
908
909 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "suspect", CTLFLAG_RW,
910 &pctx->suspect, 0, "suspect event");
911
912 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "stalled", CTLFLAG_RW,
913 &pctx->stalled, 0, "potential stalls");
914
915 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "burst", CTLFLAG_RD,
916 &pctx->poll_burst, 0, "Current polling burst size");
917
918 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "user_frac", CTLFLAG_RW,
919 &pctx->user_frac, 0,
920 "Desired user fraction of cpu time");
921
922 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "short_ticks", CTLFLAG_RW,
923 &pctx->short_ticks, 0,
924 "Hardclock ticks shorter than they should be");
925
926 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "lost_polls", CTLFLAG_RW,
927 &pctx->lost_polls, 0,
928 "How many times we would have lost a poll tick");
929
930 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "pending_polls", CTLFLAG_RD,
931 &pctx->pending_polls, 0, "Do we need to poll again");
932
933 SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "residual_burst", CTLFLAG_RD,
934 &pctx->residual_burst, 0,
935 "# of residual cycles in burst");
936
937 SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "handlers", CTLFLAG_RD,
938 &pctx->poll_handlers, 0,
939 "Number of registered poll handlers");
940}
941
942static void
943poll_sysctl_pollhz(struct netmsg *msg)
944{
945 struct pollctx *pctx;
946 int cpuid;
947
948 cpuid = mycpu->gd_cpuid;
949 KKASSERT(cpuid < POLLCTX_MAX);
950
951 pctx = poll_context[cpuid];
952 KKASSERT(pctx != NULL);
953 KKASSERT(pctx->poll_cpuid == cpuid);
954
955 /*
956 * If polling is disabled or there is no device registered,
957 * don't adjust polling systimer frequency.
958 * Polling systimer frequency will be adjusted once polling
959 * is enabled and there are registered devices.
960 */
961 pctx->pollhz = msg->nm_lmsg.u.ms_result;
962 if (pctx->polling_enabled && pctx->poll_handlers)
963 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
964
965 /*
966 * Make sure that reg_frac and reg_frac_count are within valid range.
967 */
968 if (pctx->reg_frac > pctx->pollhz) {
969 pctx->reg_frac = pctx->pollhz;
970 if (pctx->reg_frac_count > pctx->reg_frac)
971 pctx->reg_frac_count = pctx->reg_frac - 1;
972 }
973
974 lwkt_replymsg(&msg->nm_lmsg, 0);
975}
976
977static void
978poll_sysctl_polling(struct netmsg *msg)
979{
980 struct pollctx *pctx;
981 int cpuid;
982
983 cpuid = mycpu->gd_cpuid;
984 KKASSERT(cpuid < POLLCTX_MAX);
985
986 pctx = poll_context[cpuid];
987 KKASSERT(pctx != NULL);
988 KKASSERT(pctx->poll_cpuid == cpuid);
989
990 /*
991 * If polling is disabled or there is no device registered,
992 * cut the polling systimer frequency to 1hz.
993 */
994 pctx->polling_enabled = msg->nm_lmsg.u.ms_result;
995 if (pctx->polling_enabled && pctx->poll_handlers) {
996 systimer_adjust_periodic(&pctx->pollclock, pctx->pollhz);
997 } else {
998 systimer_adjust_periodic(&pctx->pollclock, 1);
999 poll_reset_state(pctx);
1000 }
1001
1002 if (!pctx->polling_enabled && pctx->poll_handlers != 0) {
1003 int i;
1004
1005 for (i = 0 ; i < pctx->poll_handlers ; i++) {
1006 struct ifnet *ifp = pctx->pr[i].ifp;
1007
1008 lwkt_serialize_enter(ifp->if_serializer);
1009
1010 if ((ifp->if_flags & IFF_POLLING) == 0) {
1011 KKASSERT(ifp->if_poll_cpuid < 0);
1012 lwkt_serialize_exit(ifp->if_serializer);
1013 continue;
1014 }
1015 ifp->if_flags &= ~IFF_POLLING;
1016 ifp->if_poll_cpuid = -1;
1017
1018 /*
1019 * Only call the interface deregistration
1020 * function if the interface is still
1021 * running.
1022 */
1023 if (ifp->if_flags & IFF_RUNNING)
1024 ifp->if_poll(ifp, POLL_DEREGISTER, 1);
1025
1026 lwkt_serialize_exit(ifp->if_serializer);
1027 }
1028 pctx->poll_handlers = 0;
1029 }
1030
1031 lwkt_replymsg(&msg->nm_lmsg, 0);
1032}
1033
1034static void
1035poll_sysctl_regfrac(struct netmsg *msg)
1036{
1037 struct pollctx *pctx;
1038 uint32_t reg_frac;
1039 int cpuid;
1040
1041 cpuid = mycpu->gd_cpuid;
1042 KKASSERT(cpuid < POLLCTX_MAX);
1043
1044 pctx = poll_context[cpuid];
1045 KKASSERT(pctx != NULL);
1046 KKASSERT(pctx->poll_cpuid == cpuid);
1047
1048 reg_frac = msg->nm_lmsg.u.ms_result;
1049 if (reg_frac > pctx->pollhz)
1050 reg_frac = pctx->pollhz;
1051 else if (reg_frac < 1)
1052 reg_frac = 1;
1053
1054 pctx->reg_frac = reg_frac;
1055 if (pctx->reg_frac_count > pctx->reg_frac)
1056 pctx->reg_frac_count = pctx->reg_frac - 1;
1057
1058 lwkt_replymsg(&msg->nm_lmsg, 0);
1059}
1060
1061static void
1062poll_sysctl_burstmax(struct netmsg *msg)
1063{
1064 struct pollctx *pctx;
1065 int cpuid;
1066
1067 cpuid = mycpu->gd_cpuid;
1068 KKASSERT(cpuid < POLLCTX_MAX);
1069
1070 pctx = poll_context[cpuid];
1071 KKASSERT(pctx != NULL);
1072 KKASSERT(pctx->poll_cpuid == cpuid);
1073
1074 pctx->poll_burst_max = msg->nm_lmsg.u.ms_result;
1075 if (pctx->poll_each_burst > pctx->poll_burst_max)
1076 pctx->poll_each_burst = pctx->poll_burst_max;
1077 if (pctx->poll_burst > pctx->poll_burst_max)
1078 pctx->poll_burst = pctx->poll_burst_max;
1079 if (pctx->residual_burst > pctx->poll_burst_max)
1080 pctx->residual_burst = pctx->poll_burst_max;
1081
1082 lwkt_replymsg(&msg->nm_lmsg, 0);
1083}
1084
1085static void
1086poll_sysctl_eachburst(struct netmsg *msg)
1087{
1088 struct pollctx *pctx;
1089 uint32_t each_burst;
1090 int cpuid;
1091
1092 cpuid = mycpu->gd_cpuid;
1093 KKASSERT(cpuid < POLLCTX_MAX);
1094
1095 pctx = poll_context[cpuid];
1096 KKASSERT(pctx != NULL);
1097 KKASSERT(pctx->poll_cpuid == cpuid);
1098
1099 each_burst = msg->nm_lmsg.u.ms_result;
1100 if (each_burst > pctx->poll_burst_max)
1101 each_burst = pctx->poll_burst_max;
1102 else if (each_burst < 1)
1103 each_burst = 1;
1104 pctx->poll_each_burst = each_burst;
1105
1106 lwkt_replymsg(&msg->nm_lmsg, 0);
1107}