| 1 | /* |
| 2 | * Copyright (c) 2003, 2004 Matthew Dillon. All rights reserved. |
| 3 | * Copyright (c) 2003, 2004 Jeffrey M. Hsu. All rights reserved. |
| 4 | * Copyright (c) 2003 Jonathan Lemon. All rights reserved. |
| 5 | * Copyright (c) 2003, 2004 The DragonFly Project. All rights reserved. |
| 6 | * |
| 7 | * This code is derived from software contributed to The DragonFly Project |
| 8 | * by Jonathan Lemon, Jeffrey M. Hsu, and Matthew Dillon. |
| 9 | * |
| 10 | * Jonathan Lemon gave Jeffrey Hsu permission to combine his copyright |
| 11 | * into this one around July 8 2004. |
| 12 | * |
| 13 | * Redistribution and use in source and binary forms, with or without |
| 14 | * modification, are permitted provided that the following conditions |
| 15 | * are met: |
| 16 | * 1. Redistributions of source code must retain the above copyright |
| 17 | * notice, this list of conditions and the following disclaimer. |
| 18 | * 2. Redistributions in binary form must reproduce the above copyright |
| 19 | * notice, this list of conditions and the following disclaimer in the |
| 20 | * documentation and/or other materials provided with the distribution. |
| 21 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 22 | * contributors may be used to endorse or promote products derived |
| 23 | * from this software without specific, prior written permission. |
| 24 | * |
| 25 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 26 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 27 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 28 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 29 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 30 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 31 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 32 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 33 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 34 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 35 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 36 | * SUCH DAMAGE. |
| 37 | * |
| 38 | * $DragonFly: src/sys/net/netisr.c,v 1.35 2007/07/10 20:24:57 dillon Exp $ |
| 39 | */ |
| 40 | |
| 41 | #include <sys/param.h> |
| 42 | #include <sys/systm.h> |
| 43 | #include <sys/kernel.h> |
| 44 | #include <sys/malloc.h> |
| 45 | #include <sys/msgport.h> |
| 46 | #include <sys/proc.h> |
| 47 | #include <sys/interrupt.h> |
| 48 | #include <sys/socket.h> |
| 49 | #include <sys/sysctl.h> |
| 50 | #include <net/if.h> |
| 51 | #include <net/if_var.h> |
| 52 | #include <net/netisr.h> |
| 53 | #include <machine/cpufunc.h> |
| 54 | |
| 55 | #include <sys/thread2.h> |
| 56 | #include <sys/msgport2.h> |
| 57 | #include <net/netmsg2.h> |
| 58 | |
/*
 * Forward declaration: dispatch function installed on the message that
 * netmsg_service_sync() sends to every registered port.
 */
static void netmsg_sync_func(struct netmsg *msg);
| 60 | |
/*
 * Bookkeeping record for one message port handed to
 * netmsg_service_port_init().  Records are linked onto netreglist.
 */
struct netmsg_port_registration {
	/* linkage on netreglist */
	TAILQ_ENTRY(netmsg_port_registration) npr_entry;
	/* the port that was registered */
	lwkt_port_t npr_port;
};
| 65 | |
/* Per-isr state, indexed by netisr number; filled in by netisr_register(). */
static struct netisr netisrs[NETISR_MAX];
/* Every port passed to netmsg_service_port_init(), in registration order. */
static TAILQ_HEAD(,netmsg_port_registration) netreglist;

/* Per-CPU thread to handle any protocol. */
struct thread netisr_cpu[MAXCPU];
/* Reply port that kfree()s the replied message (netisr_autofree_reply). */
lwkt_port netisr_afree_rport;
/* Reply port set up with lwkt_initport_replyonly_null() in netisr_init(). */
lwkt_port netisr_adone_rport;
/* Reply port set up with lwkt_initport_panic() in netisr_init(). */
lwkt_port netisr_apanic_rport;
/* Put-only port whose putport runs the message's dispatch function inline. */
lwkt_port netisr_sync_port;

/*
 * The mp_putport function that service ports had before
 * netmsg_service_port_init() replaced it; netmsg_put_port() forwards
 * non-self-referential messages to it.
 */
static int (*netmsg_fwd_port_fn)(lwkt_port_t, lwkt_msg_t);
| 77 | |
/*
 * netisr_afree_rport replymsg function, only used to handle async
 * messages which the sender has abandoned to their fate.
 *
 * The replied message is released with kfree() using the M_LWKTMSG
 * malloc type; the port argument is not used.
 */
static void
netisr_autofree_reply(lwkt_port_t port, lwkt_msg_t msg)
{
	kfree(msg, M_LWKTMSG);
}
| 87 | |
| 88 | /* |
| 89 | * We need a custom putport function to handle the case where the |
| 90 | * message target is the current thread's message port. This case |
| 91 | * can occur when the TCP or UDP stack does a direct callback to NFS and NFS |
| 92 | * then turns around and executes a network operation synchronously. |
| 93 | * |
| 94 | * To prevent deadlocking, we must execute these self-referential messages |
| 95 | * synchronously, effectively turning the message into a glorified direct |
| 96 | * procedure call back into the protocol stack. The operation must be |
| 97 | * complete on return or we will deadlock, so panic if it isn't. |
| 98 | */ |
| 99 | static int |
| 100 | netmsg_put_port(lwkt_port_t port, lwkt_msg_t lmsg) |
| 101 | { |
| 102 | netmsg_t netmsg = (void *)lmsg; |
| 103 | |
| 104 | if ((lmsg->ms_flags & MSGF_SYNC) && port == &curthread->td_msgport) { |
| 105 | netmsg->nm_dispatch(netmsg); |
| 106 | if ((lmsg->ms_flags & MSGF_DONE) == 0) |
| 107 | panic("netmsg_put_port: self-referential deadlock on netport"); |
| 108 | return(EASYNC); |
| 109 | } else { |
| 110 | return(netmsg_fwd_port_fn(port, lmsg)); |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | /* |
| 115 | * UNIX DOMAIN sockets still have to run their uipc functions synchronously, |
| 116 | * because they depend on the user proc context for a number of things |
| 117 | * (like creds) which we have not yet incorporated into the message structure. |
| 118 | * |
| 119 | * However, we maintain or message/port abstraction. Having a special |
| 120 | * synchronous port which runs the commands synchronously gives us the |
| 121 | * ability to serialize operations in one place later on when we start |
| 122 | * removing the BGL. |
| 123 | */ |
| 124 | static int |
| 125 | netmsg_sync_putport(lwkt_port_t port, lwkt_msg_t lmsg) |
| 126 | { |
| 127 | netmsg_t netmsg = (void *)lmsg; |
| 128 | |
| 129 | KKASSERT((lmsg->ms_flags & MSGF_DONE) == 0); |
| 130 | |
| 131 | lmsg->ms_target_port = port; /* required for abort */ |
| 132 | netmsg->nm_dispatch(netmsg); |
| 133 | return(EASYNC); |
| 134 | } |
| 135 | |
| 136 | static void |
| 137 | netisr_init(void) |
| 138 | { |
| 139 | int i; |
| 140 | |
| 141 | TAILQ_INIT(&netreglist); |
| 142 | |
| 143 | /* |
| 144 | * Create default per-cpu threads for generic protocol handling. |
| 145 | */ |
| 146 | for (i = 0; i < ncpus; ++i) { |
| 147 | lwkt_create(netmsg_service_loop, NULL, NULL, &netisr_cpu[i], 0, i, |
| 148 | "netisr_cpu %d", i); |
| 149 | netmsg_service_port_init(&netisr_cpu[i].td_msgport); |
| 150 | } |
| 151 | |
| 152 | /* |
| 153 | * The netisr_afree_rport is a special reply port which automatically |
| 154 | * frees the replied message. The netisr_adone_rport simply marks |
| 155 | * the message as being done. The netisr_apanic_rport panics if |
| 156 | * the message is replied to. |
| 157 | */ |
| 158 | lwkt_initport_replyonly(&netisr_afree_rport, netisr_autofree_reply); |
| 159 | lwkt_initport_replyonly_null(&netisr_adone_rport); |
| 160 | lwkt_initport_panic(&netisr_apanic_rport); |
| 161 | |
| 162 | /* |
| 163 | * The netisr_syncport is a special port which executes the message |
| 164 | * synchronously and waits for it if EASYNC is returned. |
| 165 | */ |
| 166 | lwkt_initport_putonly(&netisr_sync_port, netmsg_sync_putport); |
| 167 | } |
| 168 | |
| 169 | SYSINIT(netisr, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, netisr_init, NULL); |
| 170 | |
| 171 | /* |
| 172 | * Finish initializing the message port for a netmsg service. This also |
| 173 | * registers the port for synchronous cleanup operations such as when an |
| 174 | * ifnet is being destroyed. There is no deregistration API yet. |
| 175 | */ |
| 176 | void |
| 177 | netmsg_service_port_init(lwkt_port_t port) |
| 178 | { |
| 179 | struct netmsg_port_registration *reg; |
| 180 | |
| 181 | /* |
| 182 | * Override the putport function. Our custom function checks for |
| 183 | * self-references and executes such commands synchronously. |
| 184 | */ |
| 185 | if (netmsg_fwd_port_fn == NULL) |
| 186 | netmsg_fwd_port_fn = port->mp_putport; |
| 187 | KKASSERT(netmsg_fwd_port_fn == port->mp_putport); |
| 188 | port->mp_putport = netmsg_put_port; |
| 189 | |
| 190 | /* |
| 191 | * Keep track of ports using the netmsg API so we can synchronize |
| 192 | * certain operations (such as freeing an ifnet structure) across all |
| 193 | * consumers. |
| 194 | */ |
| 195 | reg = kmalloc(sizeof(*reg), M_TEMP, M_WAITOK|M_ZERO); |
| 196 | reg->npr_port = port; |
| 197 | TAILQ_INSERT_TAIL(&netreglist, reg, npr_entry); |
| 198 | } |
| 199 | |
| 200 | /* |
| 201 | * This function synchronizes the caller with all netmsg services. For |
| 202 | * example, if an interface is being removed we must make sure that all |
| 203 | * packets related to that interface complete processing before the structure |
| 204 | * can actually be freed. This sort of synchronization is an alternative to |
| 205 | * ref-counting the netif, removing the ref counting overhead in favor of |
| 206 | * placing additional overhead in the netif freeing sequence (where it is |
| 207 | * inconsequential). |
| 208 | */ |
| 209 | void |
| 210 | netmsg_service_sync(void) |
| 211 | { |
| 212 | struct netmsg_port_registration *reg; |
| 213 | struct netmsg smsg; |
| 214 | |
| 215 | netmsg_init(&smsg, &curthread->td_msgport, 0, netmsg_sync_func); |
| 216 | |
| 217 | TAILQ_FOREACH(reg, &netreglist, npr_entry) { |
| 218 | lwkt_domsg(reg->npr_port, &smsg.nm_lmsg, 0); |
| 219 | } |
| 220 | } |
| 221 | |
/*
 * Dispatch function for the message sent by netmsg_service_sync().
 *
 * The netmsg function simply replies the message with a result of 0.
 * It returns nothing.
 */
static void
netmsg_sync_func(struct netmsg *msg)
{
	lwkt_replymsg(&msg->nm_lmsg, 0);
}
| 231 | |
| 232 | /* |
| 233 | * Generic netmsg service loop. Some protocols may roll their own but all |
| 234 | * must do the basic command dispatch function call done here. |
| 235 | */ |
| 236 | void |
| 237 | netmsg_service_loop(void *arg) |
| 238 | { |
| 239 | struct netmsg *msg; |
| 240 | |
| 241 | while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) { |
| 242 | msg->nm_dispatch(msg); |
| 243 | } |
| 244 | } |
| 245 | |
/*
 * Call the netisr directly.
 * Queueing may be done in the msg port layer at its discretion.
 *
 * At present this is only a wrapper around netisr_queue().  Because this
 * function returns void, the error code returned by netisr_queue() is
 * discarded.
 */
void
netisr_dispatch(int num, struct mbuf *m)
{
	/* just queue it for now XXX JH */
	netisr_queue(num, m);
}
| 256 | |
| 257 | /* |
| 258 | * Same as netisr_dispatch(), but always queue. |
| 259 | * This is either used in places where we are not confident that |
| 260 | * direct dispatch is possible, or where queueing is required. |
| 261 | */ |
| 262 | int |
| 263 | netisr_queue(int num, struct mbuf *m) |
| 264 | { |
| 265 | struct netisr *ni; |
| 266 | struct netmsg_packet *pmsg; |
| 267 | lwkt_port_t port; |
| 268 | |
| 269 | KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), |
| 270 | ("netisr_queue: bad isr %d", num)); |
| 271 | |
| 272 | ni = &netisrs[num]; |
| 273 | if (ni->ni_handler == NULL) { |
| 274 | kprintf("netisr_queue: unregistered isr %d\n", num); |
| 275 | return (EIO); |
| 276 | } |
| 277 | |
| 278 | if ((port = ni->ni_mport(&m)) == NULL) |
| 279 | return (EIO); |
| 280 | |
| 281 | pmsg = &m->m_hdr.mh_netmsg; |
| 282 | |
| 283 | netmsg_init(&pmsg->nm_netmsg, &netisr_apanic_rport, 0, ni->ni_handler); |
| 284 | pmsg->nm_packet = m; |
| 285 | pmsg->nm_netmsg.nm_lmsg.u.ms_result = num; |
| 286 | lwkt_sendmsg(port, &pmsg->nm_netmsg.nm_lmsg); |
| 287 | return (0); |
| 288 | } |
| 289 | |
| 290 | void |
| 291 | netisr_register(int num, lwkt_portfn_t mportfn, netisr_fn_t handler) |
| 292 | { |
| 293 | KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), |
| 294 | ("netisr_register: bad isr %d", num)); |
| 295 | netmsg_init(&netisrs[num].ni_netmsg, &netisr_adone_rport, 0, NULL); |
| 296 | netisrs[num].ni_mport = mportfn; |
| 297 | netisrs[num].ni_handler = handler; |
| 298 | } |
| 299 | |
| 300 | int |
| 301 | netisr_unregister(int num) |
| 302 | { |
| 303 | KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), |
| 304 | ("unregister_netisr: bad isr number: %d\n", num)); |
| 305 | |
| 306 | /* XXX JH */ |
| 307 | return (0); |
| 308 | } |
| 309 | |
/*
 * Return message port for default handler thread on CPU 0.
 *
 * The mbuf pointer argument is not examined or modified.
 */
lwkt_port_t
cpu0_portfn(struct mbuf **mptr)
{
	return (&netisr_cpu[0].td_msgport);
}
| 318 | |
| 319 | lwkt_port_t |
| 320 | cpu_portfn(int cpu) |
| 321 | { |
| 322 | return (&netisr_cpu[cpu].td_msgport); |
| 323 | } |
| 324 | |
/*
 * Socket port-selection function that always returns the message port of
 * the default handler thread on CPU 0.  None of the arguments are used.
 */
/* ARGSUSED */
lwkt_port_t
cpu0_soport(struct socket *so __unused, struct sockaddr *nam __unused,
	    int req __unused)
{
	return (&netisr_cpu[0].td_msgport);
}
| 332 | |
/*
 * Socket port-selection function that always returns netisr_sync_port,
 * the port whose putport function runs the message's dispatch function
 * inline.  None of the arguments are used.
 */
lwkt_port_t
sync_soport(struct socket *so __unused, struct sockaddr *nam __unused,
	    int req __unused)
{
	return (&netisr_sync_port);
}
| 339 | |
| 340 | /* |
| 341 | * schednetisr() is used to call the netisr handler from the appropriate |
| 342 | * netisr thread for polling and other purposes. |
| 343 | * |
| 344 | * This function may be called from a hard interrupt or IPI and must be |
| 345 | * MP SAFE and non-blocking. We use a fixed per-cpu message instead of |
| 346 | * trying to allocate one. We must get ourselves onto the target cpu |
| 347 | * to safely check the MSGF_DONE bit on the message but since the message |
| 348 | * will be sent to that cpu anyway this does not add any extra work beyond |
| 349 | * what lwkt_sendmsg() would have already had to do to schedule the target |
| 350 | * thread. |
| 351 | */ |
| 352 | static void |
| 353 | schednetisr_remote(void *data) |
| 354 | { |
| 355 | int num = (int)data; |
| 356 | struct netisr *ni = &netisrs[num]; |
| 357 | lwkt_port_t port = &netisr_cpu[0].td_msgport; |
| 358 | struct netmsg *pmsg; |
| 359 | |
| 360 | pmsg = &netisrs[num].ni_netmsg; |
| 361 | crit_enter(); |
| 362 | if (pmsg->nm_lmsg.ms_flags & MSGF_DONE) { |
| 363 | netmsg_init(pmsg, &netisr_adone_rport, 0, ni->ni_handler); |
| 364 | pmsg->nm_lmsg.u.ms_result = num; |
| 365 | lwkt_sendmsg(port, &pmsg->nm_lmsg); |
| 366 | } |
| 367 | crit_exit(); |
| 368 | } |
| 369 | |
| 370 | void |
| 371 | schednetisr(int num) |
| 372 | { |
| 373 | KASSERT((num > 0 && num <= (sizeof(netisrs)/sizeof(netisrs[0]))), |
| 374 | ("schednetisr: bad isr %d", num)); |
| 375 | #ifdef SMP |
| 376 | if (mycpu->gd_cpuid != 0) |
| 377 | lwkt_send_ipiq(globaldata_find(0), schednetisr_remote, (void *)num); |
| 378 | else |
| 379 | schednetisr_remote((void *)num); |
| 380 | #else |
| 381 | schednetisr_remote((void *)num); |
| 382 | #endif |
| 383 | } |
| 384 | |