kernel - Fix excessive mbuf use in nfs_realign()
[dragonfly.git] / sys / vfs / nfs / nfs_socket.c
index aa477be..c241c3c 100644 (file)
  * 4 - write
  */
 static int proct[NFS_NPROCS] = {
-       0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
-       0, 0, 0,
+       0, 1, 0, 2, 1, 3, 3, 4, 0, 0,   /* 00-09        */
+       0, 0, 0, 0, 0, 0, 3, 3, 0, 0,   /* 10-19        */
+       0, 5, 0, 0, 0, 0,               /* 20-25        */
+};
+
+static int multt[NFS_NPROCS] = {
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 00-09        */
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 10-19        */
+       1, 2, 1, 1, 1, 1,               /* 20-25        */
 };
 
 static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
 static int nfs_realign_test;
 static int nfs_realign_count;
-static int nfs_bufpackets = 4;
 static int nfs_showrtt;
 static int nfs_showrexmit;
+int nfs_maxasyncbio = NFS_MAXASYNCBIO;
 
 SYSCTL_DECL(_vfs_nfs);
 
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
-SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
+SYSCTL_INT(_vfs_nfs, OID_AUTO, maxasyncbio, CTLFLAG_RW, &nfs_maxasyncbio, 0, "");
 
 static int nfs_request_setup(nfsm_info_t info);
 static int nfs_request_auth(struct nfsreq *rep);
@@ -183,19 +190,19 @@ int
 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
 {
        struct socket *so;
-       int error, rcvreserve, sndreserve;
-       int pktscale;
+       int error;
        struct sockaddr *saddr;
        struct sockaddr_in *sin;
        struct thread *td = &thread0; /* only used for socreate and sobind */
 
-       nmp->nm_so = NULL;
+       nmp->nm_so = so = NULL;
+       if (nmp->nm_flag & NFSMNT_FORCE)
+               return (EINVAL);
        saddr = nmp->nm_nam;
-       error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
+       error = socreate(saddr->sa_family, &so, nmp->nm_sotype,
                nmp->nm_soproto, td);
        if (error)
                goto bad;
-       so = nmp->nm_so;
        nmp->nm_soflags = so->so_proto->pr_flags;
 
        /*
@@ -283,23 +290,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
         * Get buffer reservation size from sysctl, but impose reasonable
         * limits.
         */
-       pktscale = nfs_bufpackets;
-       if (pktscale < 2)
-               pktscale = 2;
-       if (pktscale > 64)
-               pktscale = 64;
-
-       if (nmp->nm_sotype == SOCK_DGRAM) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
-               rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
-                   NFS_MAXPKTHDR) * pktscale;
-       } else {
-               if (nmp->nm_sotype != SOCK_STREAM)
-                       panic("nfscon sotype");
+       if (nmp->nm_sotype == SOCK_STREAM) {
                if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
                        struct sockopt sopt;
                        int val;
@@ -324,13 +315,8 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
                        val = 1;
                        sosetopt(so, &sopt);
                }
-               sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
-               rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
-                   sizeof (u_int32_t)) * pktscale;
        }
-       error = soreserve(so, sndreserve, rcvreserve,
-                         &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
+       error = soreserve(so, nfs_soreserve, nfs_soreserve, NULL);
        if (error)
                goto bad;
        so->so_rcv.ssb_flags |= SSB_NOINTR;
@@ -343,10 +329,19 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
                nmp->nm_sdrtt[3] = 0;
        nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
        nmp->nm_timeouts = 0;
+
+       /*
+        * Assign nm_so last.  The moment nm_so is assigned the nfs_timer()
+        * can mess with the socket.
+        */
+       nmp->nm_so = so;
        return (0);
 
 bad:
-       nfs_disconnect(nmp);
+       if (so) {
+               soshutdown(so, SHUT_RDWR);
+               soclose(so, FNONBLOCK);
+       }
        return (error);
 }
 
@@ -366,9 +361,15 @@ nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep)
        int error;
 
        nfs_disconnect(nmp);
+       if (nmp->nm_rxstate >= NFSSVC_STOPPING)
+               return (EINTR);
        while ((error = nfs_connect(nmp, rep)) != 0) {
                if (error == EINTR || error == ERESTART)
                        return (EINTR);
+               if (error == EINVAL)
+                       return (error);
+               if (nmp->nm_rxstate >= NFSSVC_STOPPING)
+                       return (EINTR);
                (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
        }
 
@@ -464,8 +465,17 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
                /*
                 * do backoff retransmit on client
                 */
-               if (rep)
+               if (rep) {
+                       if ((rep->r_nmp->nm_state & NFSSTA_SENDSPACE) == 0) {
+                               rep->r_nmp->nm_state |= NFSSTA_SENDSPACE;
+                               kprintf("Warning: NFS: Insufficient sendspace "
+                                       "(%lu),\n"
+                                       "\t You must increase vfs.nfs.soreserve"
+                                       " or decrease vfs.nfs.maxasyncbio\n",
+                                       so->so_snd.ssb_hiwat);
+                       }
                        rep->r_flags |= R_NEEDSXMIT;
+               }
        }
 
        if (error) {
@@ -637,8 +647,8 @@ tryagain:
                        if (error == 0 && sio.sb_cc != len) {
                            if (sio.sb_cc != 0)
                            log(LOG_INFO,
-                               "short receive (%d/%d) from nfs server %s\n",
-                               len - auio.uio_resid, len,
+                               "short receive (%zu/%d) from nfs server %s\n",
+                               (size_t)len - auio.uio_resid, len,
                                nmp->nm_mountp->mnt_stat.f_mntfromname);
                            error = EPIPE;
                        }
@@ -712,13 +722,22 @@ errout:
                                return (EINTR);
                        }
                } while (error == EWOULDBLOCK);
+
                len = sio.sb_cc;
                *mp = sio.sb_mb;
+
+               /*
+                * A shutdown may result in no error and no mbuf.
+                * Convert to EPIPE.
+                */
+               if (*mp == NULL && error == 0)
+                       error = EPIPE;
        }
        if (error) {
                m_freem(*mp);
                *mp = NULL;
        }
+
        /*
         * Search for any mbufs that are not a multiple of 4 bytes long
         * or with m_data not longword aligned.
@@ -841,6 +860,8 @@ nfsmout:
 
                /*
                 * Fill in the rest of the reply if we found a match.
+                *
+                * Deal with duplicate responses if there was no match.
                 */
                if (rep) {
                        rep->r_md = info.md;
@@ -879,7 +900,7 @@ nfsmout:
                         *
                         * NOTE SRTT/SDRTT are only good if R_TIMING is set.
                         */
-                       if (rep->r_flags & R_TIMING) {
+                       if ((rep->r_flags & R_TIMING) && rep->r_rexmit == 0) {
                                /*
                                 * Since the timer resolution of
                                 * NFS_HZ is so course, it can often
@@ -914,6 +935,25 @@ nfsmout:
                        nmp->nm_timeouts = 0;
                        rep->r_mrep = info.mrep;
                        nfs_hardterm(rep, 0);
+               } else {
+                       /*
+                        * Extract vers, prog, nfsver, procnum.  A duplicate
+                        * response means we didn't wait long enough so
+                        * we increase the SRTT to avoid future spurious
+                        * timeouts.
+                        */
+                       u_int procnum = nmp->nm_lastreprocnum;
+                       int n;
+
+                       if (procnum < NFS_NPROCS && proct[procnum]) {
+                               if (nfs_showrexmit)
+                                       kprintf("D");
+                               n = nmp->nm_srtt[proct[procnum]];
+                               n += NFS_ASYSCALE * NFS_HZ;
+                               if (n < NFS_ASYSCALE * NFS_HZ * 10)
+                                       n = NFS_ASYSCALE * NFS_HZ * 10;
+                               nmp->nm_srtt[proct[procnum]] = n;
+                       }
                }
                nfs_rcvunlock(nmp);
                crit_exit();
@@ -1200,7 +1240,7 @@ nfs_request_try(struct nfsreq *rep)
         * action possible is for r_mrep to be set (once we enqueue it).
         */
        if (rep->r_flags == 0xdeadc0de) {
-               print_backtrace();
+               print_backtrace(-1);
                panic("flags nbad\n");
        }
        KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0);
@@ -1220,6 +1260,12 @@ nfs_request_try(struct nfsreq *rep)
         */
        nfsstats.rpcrequests++;
 
+       if (nmp->nm_flag & NFSMNT_FORCE) {
+               rep->r_flags |= R_SOFTTERM;
+               rep->r_flags &= ~R_LOCKED;
+               return (0);
+       }
+
        /*
         * Chain request into list of outstanding requests. Be sure
         * to put it LAST so timer finds oldest requests first.  Note
@@ -1329,6 +1375,10 @@ nfs_request_waitreply(struct nfsreq *rep)
        TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
        rep->r_flags &= ~R_ONREQQ;
        --nmp->nm_reqqlen;
+       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+           nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
+               nfssvc_iod_writer_wakeup(nmp);
+       }
        crit_exit();
 
        /*
@@ -1561,6 +1611,9 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
                         */
 #ifdef NFSKERB
                        XXX
+#else
+                       ktvout.tv_sec = 0;
+                       ktvout.tv_usec = 0;
 #endif
 
                        *tl++ = rpc_auth_kerb;
@@ -1696,6 +1749,9 @@ nfs_timer_req(struct nfsreq *req)
         * to multiply by fairly large numbers.
         */
        if (req->r_rtt >= 0) {
+               /*
+                * Calculate the timeout to test against.
+                */
                req->r_rtt++;
                if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
                        timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
@@ -1704,6 +1760,7 @@ nfs_timer_req(struct nfsreq *req)
                } else {
                        timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
                }
+               timeo *= multt[req->r_procnum];
                /* timeo is still scaled by SCALE_BITS */
 
 #define NFSFS  (NFS_RTT_SCALE * NFS_HZ)
@@ -1775,13 +1832,12 @@ nfs_timer_req(struct nfsreq *req)
        /*
         * If there is enough space and the window allows.. resend it.
         *
-        * Set r_rtt to -1 in case we fail to send it now.
+        * r_rtt is left intact in case we get an answer after the
+        * retry that was a reply to the original packet.
         */
-       req->r_rtt = -1;
        if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
            (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
           (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
-               req->r_flags &= ~R_NEEDSXMIT;
                if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
                    error = so_pru_send(so, 0, m, NULL, NULL, td);
                else
@@ -1803,21 +1859,24 @@ nfs_timer_req(struct nfsreq *req)
                         * been filled in.  R_LOCKED will prevent
                         * the request from being ripped out from under
                         * us entirely.
+                        *
+                        * Record the last resent procnum to aid us
+                        * in duplicate detection on receive.
                         */
-                       if (req->r_flags & R_SENT) {
+                       if ((req->r_flags & R_NEEDSXMIT) == 0) {
                                if (nfs_showrexmit)
                                        kprintf("X");
-                               req->r_flags &= ~R_TIMING;
                                if (++req->r_rexmit > NFS_MAXREXMIT)
                                        req->r_rexmit = NFS_MAXREXMIT;
                                nmp->nm_maxasync_scaled >>= 1;
                                if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
                                        nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
                                nfsstats.rpcretries++;
+                               nmp->nm_lastreprocnum = req->r_procnum;
                        } else {
                                req->r_flags |= R_SENT;
+                               req->r_flags &= ~R_NEEDSXMIT;
                        }
-                       req->r_rtt = 0;
                }
        }
 }
@@ -1912,6 +1971,10 @@ nfs_hardterm(struct nfsreq *rep, int islocked)
                                 rep->r_info->state == NFSM_STATE_WAITREPLY);
                        rep->r_info->state = NFSM_STATE_PROCESSREPLY;
                        nfssvc_iod_reader_wakeup(nmp);
+                       if (TAILQ_FIRST(&nmp->nm_bioq) &&
+                           nmp->nm_reqqlen <= nfs_maxasyncbio * 2 / 3) {
+                               nfssvc_iod_writer_wakeup(nmp);
+                       }
                }
                mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
        }
@@ -2090,49 +2153,48 @@ nfs_rcvunlock(struct nfsmount *nmp)
 }
 
 /*
- *     nfs_realign:
+ * nfs_realign:
+ *
+ * Check for badly aligned mbuf data and realign by copying the unaligned
+ * portion of the data into a new mbuf chain and freeing the portions
+ * of the old chain that were replaced.
  *
- *     Check for badly aligned mbuf data and realign by copying the unaligned
- *     portion of the data into a new mbuf chain and freeing the portions
- *     of the old chain that were replaced.
+ * We cannot simply realign the data within the existing mbuf chain
+ * because the underlying buffers may contain other rpc commands and
+ * we cannot afford to overwrite them.
  *
- *     We cannot simply realign the data within the existing mbuf chain
- *     because the underlying buffers may contain other rpc commands and
- *     we cannot afford to overwrite them.
+ * We would prefer to avoid this situation entirely.  The situation does
+ * not occur with NFS/UDP and is supposed to only occasionally occur
+ * with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
  *
- *     We would prefer to avoid this situation entirely.  The situation does
- *     not occur with NFS/UDP and is supposed to only occassionally occur
- *     with TCP.  Use vfs.nfs.realign_count and realign_test to check this.
+ * NOTE!  MB_DONTWAIT cannot be used here.  The mbufs must be acquired
+ *       because the rpc request OR reply cannot be thrown away.  TCP NFS
+ *       mounts do not retry their RPCs unless the TCP connection itself
+ *       is dropped so throwing away a RPC will basically cause the NFS
+ *       operation to lockup indefinitely.
  */
 static void
 nfs_realign(struct mbuf **pm, int hsiz)
 {
        struct mbuf *m;
        struct mbuf *n = NULL;
-       int off = 0;
 
+       /*
+        * Check for misalignment
+        */
        ++nfs_realign_test;
-
        while ((m = *pm) != NULL) {
-               if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
-                       n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
-                       n->m_len = 0;
+               if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3))
                        break;
-               }
                pm = &m->m_next;
        }
 
        /*
-        * If n is non-NULL, loop on m copying data, then replace the
-        * portion of the chain that had to be realigned.
+        * If misalignment found make a completely new copy.
         */
-       if (n != NULL) {
+       if (m) {
                ++nfs_realign_count;
-               while (m) {
-                       m_copyback(n, off, m->m_len, mtod(m, caddr_t));
-                       off += m->m_len;
-                       m = m->m_next;
-               }
+               n = m_dup_data(m, MB_WAIT);
                m_freem(*pm);
                *pm = n;
        }
@@ -2232,7 +2294,9 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
                nd->nd_cr.cr_ref = 1;
                nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
+               nd->nd_cr.cr_ruid = nd->nd_cr.cr_svuid = nd->nd_cr.cr_uid;
                nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
+               nd->nd_cr.cr_rgid = nd->nd_cr.cr_svgid = nd->nd_cr.cr_gid;
                len = fxdr_unsigned(int, *tl);
                if (len < 0 || len > RPCAUTH_UNIXGIDS) {
                        m_freem(info.mrep);
@@ -2336,6 +2400,9 @@ nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
                         */
 #ifdef NFSKERB
                        XXX
+#else
+                       tvout.tv_sec = 0;
+                       tvout.tv_usec = 0;
 #endif
 
                        tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);