Fix LINT build.
[dragonfly.git] / sys / netinet / ip_divert.c
index 6f97103..c714e71 100644 (file)
  * SUCH DAMAGE.
  *
  * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.42.2.6 2003/01/23 21:06:45 sam Exp $
- * $DragonFly: src/sys/netinet/ip_divert.c,v 1.38 2008/08/28 14:10:03 sephe Exp $
+ * $DragonFly: src/sys/netinet/ip_divert.c,v 1.40 2008/10/21 13:51:01 sephe Exp $
  */
 
+#define        _IP_VHL
+
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/socketvar.h>
+#include <sys/socketvar2.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
-#include <sys/thread2.h>
+#include <sys/priv.h>
+#include <sys/in_cksum.h>
+#include <sys/lock.h>
 #ifdef SMP
 #include <sys/msgport.h>
 #endif
 
-#include <vm/vm_zone.h>
-
 #include <net/if.h>
 #include <net/route.h>
+
 #ifdef SMP
 #include <net/netmsg2.h>
 #endif
+#include <sys/thread2.h>
+#include <sys/mplock2.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
@@ -119,6 +125,10 @@ static struct inpcbinfo divcbinfo;
 static u_long  div_sendspace = DIVSNDQ;        /* XXX sysctl ? */
 static u_long  div_recvspace = DIVRCVQ;        /* XXX sysctl ? */
 
+static struct mbuf *ip_divert(struct mbuf *, int, int);
+
+static struct lwkt_token div_token = LWKT_TOKEN_MP_INITIALIZER(div_token);
+
 /*
  * Initialize divert connection block queue.
  */
@@ -135,8 +145,8 @@ div_init(void)
        divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask);
        divcbinfo.wildcardhashbase = hashinit(1, M_PCB,
                                              &divcbinfo.wildcardhashmask);
-       divcbinfo.ipi_zone = zinit("divcb", sizeof(struct inpcb),
-                                  maxsockets, ZONE_INTERRUPT, 0);
+       divcbinfo.ipi_size = sizeof(struct inpcb);
+       ip_divert_p = ip_divert;
 }
 
 /*
@@ -151,17 +161,12 @@ div_input(struct mbuf *m, ...)
 }
 
 struct lwkt_port *
-div_soport(struct socket *so, struct sockaddr *nam,
-          struct mbuf **mptr, int req)
+div_soport(struct socket *so, struct sockaddr *nam, struct mbuf **mptr)
 {
        struct sockaddr_in *sin;
        struct mbuf *m;
        int dir;
 
-       /* Except for send(), everything happens on CPU0 */
-       if (req != PRU_SEND)
-               return cpu0_soport(so, nam, mptr, req);
-
        sin = (struct sockaddr_in *)nam;
        m = *mptr;
        M_ASSERTPKTHDR(m);
@@ -213,7 +218,16 @@ div_soport(struct socket *so, struct sockaddr *nam,
                m->m_pkthdr.rcvif = ifa->ifa_ifp;
        }
 
-       return ip_mport(mptr, dir);
+       /*
+        * Recalculate the protocol thread.
+        */
+       ip_cpufn(mptr, 0, dir);
+       m = *mptr;
+       if (m) {
+               KKASSERT(m->m_flags & M_HASH);
+               return(cpu_portfn(m->m_pkthdr.hash));
+       }
+       return(NULL);
 }
 
 /*
@@ -282,12 +296,21 @@ div_packet(struct mbuf *m, int incoming, int port)
                 * (see div_output for the other half of this.)
                 */
                ksnprintf(divsrc.sin_zero, sizeof divsrc.sin_zero,
-                        m->m_pkthdr.rcvif->if_xname);
+                         m->m_pkthdr.rcvif->if_xname);
        }
 
        /* Put packet on socket queue, if any */
        sa = NULL;
        nport = htons((u_int16_t)port);
+
+       /*
+        * XXX
+        * Following loop to locate the inpcb is MPSAFE since the inpcb
+        * insertion/removal happens on the same CPU (CPU0), however,
+        * saving/testing the socket pointer is not MPSAFE.  So we still
+        * need to hold div_token here.
+        */
+       lwkt_gettoken(&div_token);
        LIST_FOREACH(inp, &divcbinfo.pcblisthead, inp_list) {
                if (inp->inp_flags & INP_PLACEMARKER)
                        continue;
@@ -295,16 +318,18 @@ div_packet(struct mbuf *m, int incoming, int port)
                        sa = inp->inp_socket;
        }
        if (sa) {
-               if (ssb_appendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, m,
-                                (struct mbuf *)NULL) == 0)
+               lwkt_gettoken(&sa->so_rcv.ssb_token);
+               if (ssb_appendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, m, NULL) == 0)
                        m_freem(m);
                else
                        sorwakeup(sa);
+               lwkt_reltoken(&sa->so_rcv.ssb_token);
        } else {
                m_freem(m);
                ipstat.ips_noproto++;
                ipstat.ips_delivered--;
        }
+       lwkt_reltoken(&div_token);
 }
 
 #ifdef SMP
@@ -328,7 +353,7 @@ div_packet_handler(struct netmsg *nmsg)
 }
 #endif /* SMP */
 
-void
+static void
 divert_packet(struct mbuf *m, int incoming)
 {
        struct m_tag *mtag;
@@ -355,8 +380,8 @@ divert_packet(struct mbuf *m, int incoming)
                struct lwkt_msg *msg;
 
                nmp = &m->m_hdr.mh_netmsg;
-               netmsg_init(&nmp->nm_netmsg, &netisr_apanic_rport, 0,
-                           div_packet_handler);
+               netmsg_init(&nmp->nm_netmsg, NULL, &netisr_apanic_rport,
+                           0, div_packet_handler);
                nmp->nm_packet = m;
 
                msg = &nmp->nm_netmsg.nm_lmsg;
@@ -448,15 +473,18 @@ div_attach(struct socket *so, int proto, struct pru_attach_info *ai)
        inp  = so->so_pcb;
        if (inp)
                panic("div_attach");
-       if ((error = suser_cred(ai->p_ucred, NULL_CRED_OKAY)) != 0)
+       if ((error = priv_check_cred(ai->p_ucred, PRIV_ROOT, NULL_CRED_OKAY)) != 0)
                return error;
 
        error = soreserve(so, div_sendspace, div_recvspace, ai->sb_rlimit);
        if (error)
                return error;
+       lwkt_gettoken(&div_token);
        error = in_pcballoc(so, &divcbinfo);
-       if (error)
+       if (error) {
+               lwkt_reltoken(&div_token);
                return error;
+       }
        inp = (struct inpcb *)so->so_pcb;
        inp->inp_ip_p = proto;
        inp->inp_vflag |= INP_IPV4;
@@ -465,7 +493,9 @@ div_attach(struct socket *so, int proto, struct pru_attach_info *ai)
         * The socket is always "connected" because
         * we always know "where" to send the packet.
         */
-       so->so_state |= SS_ISCONNECTED;
+       so->so_port = cpu0_soport(so, NULL, NULL);
+       sosetstate(so, SS_ISCONNECTED);
+       lwkt_reltoken(&div_token);
        return 0;
 }
 
@@ -481,19 +511,33 @@ div_detach(struct socket *so)
        return 0;
 }
 
+/*
+ * NOTE: (so) is referenced from soabort*() and netmsg_pru_abort()
+ *      will sofree() it when we return.
+ */
 static int
 div_abort(struct socket *so)
 {
+       int error;
+
        soisdisconnected(so);
-       return div_detach(so);
+       error = div_detach(so);
+
+       return error;
 }
 
 static int
 div_disconnect(struct socket *so)
 {
+       int error;
+
        if (!(so->so_state & SS_ISCONNECTED))
                return ENOTCONN;
-       return div_abort(so);
+       soreference(so);
+       error = div_abort(so);
+       sofree(so);
+
+       return error;
 }
 
 static int
@@ -529,7 +573,7 @@ static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
         struct mbuf *control, struct thread *td)
 {
-       /* Length check already done in ip_mport() */
+       /* Length check already done in ip_cpufn() */
        KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf"));
 
        /* Send packet */
@@ -559,6 +603,155 @@ struct pr_usrreqs div_usrreqs = {
        .pru_shutdown = div_shutdown,
        .pru_sockaddr = in_setsockaddr,
        .pru_sosend = sosend,
-       .pru_soreceive = soreceive,
-       .pru_sopoll = sopoll
+       .pru_soreceive = soreceive
 };
+/*
+ * Hand an outgoing packet to the divert protocol.
+ *
+ * m   - packet to divert; it is passed on to divert_packet().
+ * tee - non-zero to duplicate the packet with m_dup() before diverting it.
+ *
+ * Returns whatever m_dup() produced when 'tee' is non-zero, otherwise NULL.
+ * NOTE(review): m_dup() is called with MB_DONTWAIT, so the result is
+ * presumably NULL when the duplication fails; in that case a 'tee' behaves
+ * like a plain divert -- confirm this is intended.
+ */
+static struct mbuf *
+ip_divert_out(struct mbuf *m, int tee)
+{
+       struct mbuf *clone = NULL;
+       struct ip *ip = mtod(m, struct ip *);
+
+       /*
+        * Clone packet if we're doing a 'tee'.  The clone is taken before
+        * the header fields of 'm' are rewritten below.
+        */
+       if (tee)
+               clone = m_dup(m, MB_DONTWAIT);
+
+       /*
+        * XXX
+        * delayed checksums are not currently compatible
+        * with divert sockets, so a pending delayed data checksum is
+        * computed now and the flag is cleared.
+        */
+       if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+               in_delayed_cksum(m);
+               m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+       }
+
+       /*
+        * Restore packet header fields to original values.
+        * NOTE(review): presumably ip_len/ip_off are in host byte order on
+        * entry and htons() puts them back in network order -- confirm
+        * against the caller.
+        */
+       ip->ip_len = htons(ip->ip_len);
+       ip->ip_off = htons(ip->ip_off);
+
+       /* Deliver packet to divert input routine (0 == not incoming) */
+       divert_packet(m, 0);
+
+       /* If 'tee', the caller continues with the clone; otherwise NULL */
+       return clone;
+}
+/*
+ * Hand an incoming packet to the divert protocol, reassembling it first
+ * when it is a fragment.
+ *
+ * m   - incoming packet.
+ * tee - non-zero to duplicate the packet with m_dup() before diverting it.
+ *       For fragments this value is replaced by the 'tee' field stored in
+ *       the PACKET_TAG_IPFW_DIVERT tag of the reassembled packet.
+ *
+ * Returns NULL when ip_reass() returns NULL, when the reassembled packet
+ * carries no divert tag (the packet is freed in that case), or when no
+ * clone exists.  Otherwise returns the clone with its divert tag removed.
+ */
+static struct mbuf *
+ip_divert_in(struct mbuf *m, int tee)
+{
+       struct mbuf *clone = NULL;
+       struct ip *ip = mtod(m, struct ip *);
+       struct m_tag *mtag;
+
+       if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
+               const struct divert_info *divinfo;
+               u_short frag_off;
+               int hlen;
+
+               /*
+                * Only trust divert info in the fragment
+                * at offset 0.  Any other fragment has its divert tag
+                * deleted before it is handed to ip_reass().
+                * NOTE(review): frag_off is a u_short, so the shift by 3
+                * presumably pushes the flag bits out and leaves only the
+                * fragment offset -- confirm against the IP_MF/IP_OFFMASK
+                * definitions.
+                * NOTE(review): the result of m_tag_find() is passed to
+                * m_tag_delete() unchecked; confirm a divert tag is always
+                * present on packets that reach this point.
+                */
+               frag_off = ip->ip_off << 3;
+               if (frag_off != 0) {
+                       mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
+                       m_tag_delete(m, mtag);
+               }
+
+               /*
+                * Attempt reassembly; if it succeeds, proceed.
+                * ip_reass() will return a different mbuf, so 'ip' is
+                * re-derived from it below.
+                */
+               m = ip_reass(m);
+               if (m == NULL)
+                       return NULL;
+               ip = mtod(m, struct ip *);
+
+               /* Caller needs to redispatch the packet, if it is for us */
+               m->m_pkthdr.fw_flags |= FW_MBUF_REDISPATCH;
+
+               /*
+                * Get the header length of the reassembled
+                * packet
+                */
+               hlen = IP_VHL_HL(ip->ip_vhl) << 2;
+
+               /*
+                * Restore original checksum before diverting
+                * packet.  ip_len is first grown by the header length
+                * (NOTE(review): presumably ip_reass() leaves ip_len without
+                * the header -- confirm), then ip_len/ip_off are put in
+                * network byte order for the checksum computation and
+                * converted back to host order afterwards.
+                */
+               ip->ip_len += hlen;
+               ip->ip_len = htons(ip->ip_len);
+               ip->ip_off = htons(ip->ip_off);
+               ip->ip_sum = 0;
+               if (hlen == sizeof(struct ip))
+                       ip->ip_sum = in_cksum_hdr(ip);
+               else
+                       ip->ip_sum = in_cksum(m, hlen);
+               ip->ip_off = ntohs(ip->ip_off);
+               ip->ip_len = ntohs(ip->ip_len);
+
+               /*
+                * Only use the saved divert info; the 'tee' argument is
+                * overwritten with the value stored in the tag.
+                */
+               mtag = m_tag_find(m, PACKET_TAG_IPFW_DIVERT, NULL);
+               if (mtag == NULL) {
+                       /* Wrongly configured ipfw */
+                       kprintf("ip_input no divert info\n");
+                       m_freem(m);
+                       return NULL;
+               }
+               divinfo = m_tag_data(mtag);
+               tee = divinfo->tee;
+       }
+
+       /*
+        * Divert or tee packet to the divert protocol if
+        * required.
+        */
+
+       /*
+        * Clone packet if we're doing a 'tee'.  The clone is taken before
+        * the header fields of 'm' are rewritten below.
+        */
+       if (tee)
+               clone = m_dup(m, MB_DONTWAIT);
+
+       /*
+        * Restore packet header fields to original
+        * values
+        */
+       ip->ip_len = htons(ip->ip_len);
+       ip->ip_off = htons(ip->ip_off);
+
+       /* Deliver packet to divert input routine (1 == incoming) */
+       divert_packet(m, 1);
+
+       /*
+        * Catch invalid reference: 'm' has been handed to divert_packet(),
+        * so the local pointers are cleared to make any later use obvious.
+        */
+       m = NULL;
+       ip = NULL;
+
+       ipstat.ips_delivered++;
+
+       /* If 'tee', continue with original packet */
+       if (clone != NULL) {
+               /*
+                * Complete processing of the packet.
+                * XXX Better safe than sorry, remove the DIVERT tag.
+                */
+               mtag = m_tag_find(clone, PACKET_TAG_IPFW_DIVERT, NULL);
+               KKASSERT(mtag != NULL);
+               m_tag_delete(clone, mtag);
+       }
+       return clone;
+}
+/*
+ * Divert entry point; div_init() stores this function in ip_divert_p.
+ *
+ * m        - packet to divert.
+ * tee      - non-zero to duplicate the packet before diverting it.
+ * incoming - non-zero selects ip_divert_in(), zero selects ip_divert_out().
+ *
+ * Returns whatever the selected routine returns.
+ */
+static struct mbuf *
+ip_divert(struct mbuf *m, int tee, int incoming)
+{
+       struct mbuf *ret;
+
+       if (incoming)
+               ret = ip_divert_in(m, tee);
+       else
+               ret = ip_divert_out(m, tee);
+       return ret;
+}