sys/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
   3  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to The DragonFly Project
   6  * by Jeffrey M. Hsu.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of The DragonFly Project nor the names of its
  17  *    contributors may be used to endorse or promote products derived
  18  *    from this software without specific, prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  */
  33
  34 /*
  35  * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
  36  *
  37  * License terms: all terms for the DragonFly license above plus the following:
  38  *
  39  * 4. All advertising materials mentioning features or use of this software
  40  *    must display the following acknowledgement:
  41  *
  42  *      This product includes software developed by Jeffrey M. Hsu
  43  *      for the DragonFly Project.
  44  *
  45  *    This requirement may be waived with permission from Jeffrey Hsu.
  46  *    This requirement will sunset and may be removed on July 8 2005,
  47  *    after which the standard DragonFly license (as shown above) will
  48  *    apply.
  49  */
  50
  51 /*
  52  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  53  *      The Regents of the University of California.  All rights reserved.
  54  *
  55  * Redistribution and use in source and binary forms, with or without
  56  * modification, are permitted provided that the following conditions
  57  * are met:
  58  * 1. Redistributions of source code must retain the above copyright
  59  *    notice, this list of conditions and the following disclaimer.
  60  * 2. Redistributions in binary form must reproduce the above copyright
  61  *    notice, this list of conditions and the following disclaimer in the
  62  *    documentation and/or other materials provided with the distribution.
  63  * 3. All advertising materials mentioning features or use of this software
  64  *    must display the following acknowledgement:
  65  *      This product includes software developed by the University of
  66  *      California, Berkeley and its contributors.
  67  * 4. Neither the name of the University nor the names of its contributors
  68  *    may be used to endorse or promote products derived from this software
  69  *    without specific prior written permission.
  70  *
  71  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  81  * SUCH DAMAGE.
  82  *
  83  * @(#)uipc_mbuf.c      8.2 (Berkeley) 1/4/94
  84  * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
  85  * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.21 2004/07/08 22:07:34 hsu Exp $
  86  */
  87
  88 #include "opt_param.h"
  89 #include "opt_mbuf_stress_test.h"
  90 #include <sys/param.h>
  91 #include <sys/systm.h>
  92 #include <sys/malloc.h>
  93 #include <sys/mbuf.h>
  94 #include <sys/kernel.h>
  95 #include <sys/sysctl.h>
  96 #include <sys/domain.h>
  97 #include <sys/protosw.h>
  98 #include <sys/uio.h>
  99 #include <sys/thread.h>
 100 #include <sys/globaldata.h>
 101
 102 #include <vm/vm.h>
 103 #include <vm/vm_kern.h>
 104 #include <vm/vm_extern.h>
 105
 106 #ifdef INVARIANTS
 107 #include <machine/cpu.h>
 108 #endif
 109
  110 static void mbinit (void *);
  111 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)	/* run mbinit() at SI_SUB_MBUF, SI_ORDER_FIRST */
  112
  113 struct mbuf *mbutl;	/* lowest address the KKASSERTs below accept for a cluster */
  114 struct mbuf *mbute;	/* cluster addresses are asserted to be strictly below this */
  115 char    *mclrefcnt;	/* per-cluster reference counts, indexed by mtocl() */
  116 struct mbstat mbstat;	/* allocator statistics; exported as kern.ipc.mbstat */
  117 u_long  mbtypes[MT_NTYPES];	/* number of mbufs currently of each type (incl. MT_FREE) */
  118 struct mbuf *mmbfree;	/* head of the free mbuf list, linked through m_next */
  119 union mcluster *mclfree;	/* head of the free cluster list, linked through mcl_next */
  120 int     max_linkhdr;
  121 int     max_protohdr;
  122 int     max_hdr;
  123 int     max_datalen;
  124 int     m_defragpackets;
  125 int     m_defragbytes;
  126 int     m_defraguseless;
  127 int     m_defragfailure;
  128 #ifdef MBUF_STRESS_TEST
  129 int     m_defragrandomfailures;
  130 #endif
  131
  132 int     nmbclusters;	/* cluster limit checked by m_clalloc(); set in tunable_mbinit() */
  133 int     nmbufs;	/* mbuf limit checked by m_mballoc(); set in tunable_mbinit() */
  134 u_int   m_mballoc_wid = 0;	/* bumped before sleeping in m_mballoc_wait(); also its sleep channel */
  135 u_int   m_clalloc_wid = 0;	/* bumped before sleeping in m_clalloc_wait(); also its sleep channel */
  136
  137 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
  138            &max_linkhdr, 0, "");
  139 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
  140            &max_protohdr, 0, "");
  141 SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
  142 SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
  143            &max_datalen, 0, "");
  144 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
  145            &mbuf_wait, 0, "");	/* mbuf_wait is the tsleep() timeout used by the *_wait() routines */
  146 SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
  147 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
  148            sizeof(mbtypes), "LU", "");
  149 SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
  150            &nmbclusters, 0, "Maximum number of mbuf clusters available");
  151 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
  152            "Maximum number of mbufs available");
  153 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
  154            &m_defragpackets, 0, "");
  155 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
  156            &m_defragbytes, 0, "");
  157 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
  158            &m_defraguseless, 0, "");
  159 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
  160            &m_defragfailure, 0, "");
  161 #ifdef MBUF_STRESS_TEST
  162 SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
  163            &m_defragrandomfailures, 0, "");
  164 #endif
 165
 166 static void     m_reclaim (void);
 167
 168 #ifndef NMBCLUSTERS
 169 #define NMBCLUSTERS     (512 + maxusers * 16)
 170 #endif
 171 #ifndef NMBUFS
 172 #define NMBUFS          (nmbclusters * 4)
 173 #endif
 174
 175 /*
 176  * Perform sanity checks of tunables declared above.
 177  */
 178 static void
 179 tunable_mbinit(void *dummy)
 180 {
 181
 182         /*
 183          * This has to be done before VM init.
 184          */
 185         nmbclusters = NMBCLUSTERS;
 186         TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 187         nmbufs = NMBUFS;
 188         TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
 189         /* Sanity checks */
 190         if (nmbufs < nmbclusters * 2)
 191                 nmbufs = nmbclusters * 2;
 192
 193         return;
 194 }
 195 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
 196
 197 /* "number of clusters of pages" */
 198 #define NCL_INIT        1
 199
 200 #define NMB_INIT        16
 201
 202 /* ARGSUSED*/
 203 static void
 204 mbinit(void *dummy)
 205 {
 206         int s;
 207
 208         mmbfree = NULL; mclfree = NULL;
 209         mbstat.m_msize = MSIZE;
 210         mbstat.m_mclbytes = MCLBYTES;
 211         mbstat.m_minclsize = MINCLSIZE;
 212         mbstat.m_mlen = MLEN;
 213         mbstat.m_mhlen = MHLEN;
 214
 215         s = splimp();
 216         if (m_mballoc(NMB_INIT, MB_DONTWAIT) == 0)
 217                 goto bad;
 218 #if MCLBYTES <= PAGE_SIZE
 219         if (m_clalloc(NCL_INIT, MB_DONTWAIT) == 0)
 220                 goto bad;
 221 #else
 222         /* It's OK to call contigmalloc in this context. */
 223         if (m_clalloc(16, MB_WAIT) == 0)
 224                 goto bad;
 225 #endif
 226         splx(s);
 227         return;
 228 bad:
 229         panic("mbinit");
 230 }
 231
 232 /*
 233  * Allocate at least nmb mbufs and place on mbuf free list.
 234  * Must be called at splimp.
 235  */
 236 /* ARGSUSED */
 237 int
 238 m_mballoc(int nmb, int how)
 239 {
 240         caddr_t p;
 241         int i;
 242         int nbytes;
 243
 244         /*
 245          * If we've hit the mbuf limit, stop allocating from mb_map,
 246          * (or trying to) in order to avoid dipping into the section of
 247          * mb_map which we've "reserved" for clusters.
 248          */
 249         if ((nmb + mbstat.m_mbufs) > nmbufs)
 250                 return (0);
 251
 252         /*
 253          * Once we run out of map space, it will be impossible to get
 254          * any more (nothing is ever freed back to the map)
 255          * -- however you are not dead as m_reclaim might
 256          * still be able to free a substantial amount of space.
 257          *
 258          * XXX Furthermore, we can also work with "recycled" mbufs (when
 259          * we're calling with MB_WAIT the sleep procedure will be woken
 260          * up when an mbuf is freed. See m_mballoc_wait()).
 261          */
 262         if (mb_map_full)
 263                 return (0);
 264
 265         nbytes = round_page(nmb * MSIZE);
 266         p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT);
 267         if (p == 0 && how == MB_WAIT) {
 268                 mbstat.m_wait++;
 269                 p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK);
 270         }
 271
 272         /*
 273          * Either the map is now full, or `how' is M_NOWAIT and there
 274          * are no pages left.
 275          */
 276         if (p == NULL)
 277                 return (0);
 278
 279         nmb = nbytes / MSIZE;
 280         for (i = 0; i < nmb; i++) {
 281                 ((struct mbuf *)p)->m_next = mmbfree;
 282                 mmbfree = (struct mbuf *)p;
 283                 p += MSIZE;
 284         }
 285         mbstat.m_mbufs += nmb;
 286         mbtypes[MT_FREE] += nmb;
 287         return (1);
 288 }
 289
 290 /*
 291  * Once the mb_map has been exhausted and if the call to the allocation macros
 292  * (or, in some cases, functions) is with MB_WAIT, then it is necessary to rely
 293  * solely on reclaimed mbufs. Here we wait for an mbuf to be freed for a
 294  * designated (mbuf_wait) time.
 295  */
 296 struct mbuf *
 297 m_mballoc_wait(int caller, int type)
 298 {
 299         struct mbuf *p;
 300         int s;
 301
 302         s = splimp();
 303         m_mballoc_wid++;
 304         if ((tsleep(&m_mballoc_wid, 0, "mballc", mbuf_wait)) == EWOULDBLOCK)
 305                 m_mballoc_wid--;
 306         splx(s);
 307
 308         /*
 309          * Now that we (think) that we've got something, we will redo an
 310          * MGET, but avoid getting into another instance of m_mballoc_wait()
 311          * XXX: We retry to fetch _even_ if the sleep timed out. This is left
 312          *      this way, purposely, in the [unlikely] case that an mbuf was
 313          *      freed but the sleep was not awakened in time.
 314          */
 315         p = NULL;
 316         switch (caller) {
 317         case MGET_C:
 318                 MGET(p, MB_DONTWAIT, type);
 319                 break;
 320         case MGETHDR_C:
 321                 MGETHDR(p, MB_DONTWAIT, type);
 322                 break;
 323         default:
 324                 panic("m_mballoc_wait: invalid caller (%d)", caller);
 325         }
 326
 327         s = splimp();
 328         if (p != NULL) {                /* We waited and got something... */
 329                 mbstat.m_wait++;
 330                 /* Wake up another if we have more free. */
 331                 if (mmbfree != NULL)
 332                         MMBWAKEUP();
 333         }
 334         splx(s);
 335         return (p);
 336 }
 337
 338 #if MCLBYTES > PAGE_SIZE
 339 static int i_want_my_mcl;
 340
 341 static void
 342 kproc_mclalloc(void)
 343 {
 344         int status;
 345
 346         while (1) {
 347                 tsleep(&i_want_my_mcl, 0, "mclalloc", 0);
 348
 349                 for (; i_want_my_mcl; i_want_my_mcl--) {
 350                         if (m_clalloc(1, MB_WAIT) == 0)
 351                                 printf("m_clalloc failed even in process context!\n");
 352                 }
 353         }
 354 }
 355
 356 static struct thread *mclallocthread;
 357 static struct kproc_desc mclalloc_kp = {
 358         "mclalloc",
 359         kproc_mclalloc,
 360         &mclallocthread
 361 };
 362 SYSINIT(mclallocthread, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
 363            &mclalloc_kp);
 364 #endif
 365
 366 /*
 367  * Allocate some number of mbuf clusters
 368  * and place on cluster free list.
 369  * Must be called at splimp.
 370  */
 371 /* ARGSUSED */
 372 int
 373 m_clalloc(int ncl, int how)
 374 {
 375         caddr_t p;
 376         int i;
 377         int npg;
 378
 379         /*
 380          * If we've hit the mcluster number limit, stop allocating from
 381          * mb_map, (or trying to) in order to avoid dipping into the section
 382          * of mb_map which we've "reserved" for mbufs.
 383          */
 384         if ((ncl + mbstat.m_clusters) > nmbclusters)
 385                 goto m_clalloc_fail;
 386
 387         /*
 388          * Once we run out of map space, it will be impossible
 389          * to get any more (nothing is ever freed back to the
 390          * map). From this point on, we solely rely on freed
 391          * mclusters.
 392          */
 393         if (mb_map_full)
 394                 goto m_clalloc_fail;
 395
 396 #if MCLBYTES > PAGE_SIZE
 397         if (how != MB_WAIT) {
 398                 i_want_my_mcl += ncl;
 399                 wakeup(&i_want_my_mcl);
 400                 mbstat.m_wait++;
 401                 p = 0;
 402         } else {
 403                 p = contigmalloc_map(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul,
 404                                   ~0ul, PAGE_SIZE, 0, mb_map);
 405         }
 406 #else
 407         npg = ncl;
 408         p = (caddr_t)kmem_malloc(mb_map, ctob(npg),
 409                                  how != MB_WAIT ? M_NOWAIT : M_WAITOK);
 410         ncl = ncl * PAGE_SIZE / MCLBYTES;
 411 #endif
 412         /*
 413          * Either the map is now full, or `how' is M_NOWAIT and there
 414          * are no pages left.
 415          */
 416         if (p == NULL) {
 417                 static int last_report ; /* when we did that (in ticks) */
 418 m_clalloc_fail:
 419                 mbstat.m_drops++;
 420                 if (ticks < last_report || (ticks - last_report) >= hz) {
 421                         last_report = ticks;
 422                         printf("All mbuf clusters exhausted, please see tuning(7).\n");
 423                 }
 424                 return (0);
 425         }
 426
 427         for (i = 0; i < ncl; i++) {
 428                 ((union mcluster *)p)->mcl_next = mclfree;
 429                 mclfree = (union mcluster *)p;
 430                 p += MCLBYTES;
 431                 mbstat.m_clfree++;
 432         }
 433         mbstat.m_clusters += ncl;
 434         return (1);
 435 }
 436
 437 /*
 438  * Once the mb_map submap has been exhausted and the allocation is called with
 439  * MB_WAIT, we rely on the mclfree union pointers. If nothing is free, we will
 440  * sleep for a designated amount of time (mbuf_wait) or until we're woken up
 441  * due to sudden mcluster availability.
 442  */
 443 caddr_t
 444 m_clalloc_wait(void)
 445 {
 446         caddr_t p;
 447         int s;
 448
 449         /* If in interrupt context, and INVARIANTS, maintain sanity and die. */
 450         KASSERT(mycpu->gd_intr_nesting_level == 0, ("CLALLOC: CANNOT WAIT IN INTERRUPT"));
 451
 452         /* Sleep until something's available or until we expire. */
 453         m_clalloc_wid++;
 454         if ((tsleep(&m_clalloc_wid, 0, "mclalc", mbuf_wait)) == EWOULDBLOCK)
 455                 m_clalloc_wid--;
 456
 457         /*
 458          * Now that we (think) that we've got something, we will redo and
 459          * MGET, but avoid getting into another instance of m_clalloc_wait()
 460          */
 461         p = m_mclalloc(MB_DONTWAIT);
 462
 463         s = splimp();
 464         if (p != NULL) {        /* We waited and got something... */
 465                 mbstat.m_wait++;
 466                 /* Wake up another if we have more free. */
 467                 if (mclfree != NULL)
 468                         MCLWAKEUP();
 469         }
 470
 471         splx(s);
 472         return (p);
 473 }
 474
 475 /*
 476  * When MGET fails, ask protocols to free space when short of memory,
 477  * then re-attempt to allocate an mbuf.
 478  */
 479 struct mbuf *
 480 m_retry(int i, int t)
 481 {
 482         struct mbuf *m;
 483         int ms;
 484
 485         /*
 486          * Must only do the reclaim if not in an interrupt context.
 487          */
 488         if (i == MB_WAIT) {
 489                 KASSERT(mycpu->gd_intr_nesting_level == 0,
 490                     ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
 491                 m_reclaim();
 492         }
 493
 494         ms = splimp();
 495         if (mmbfree == NULL)
 496                 (void)m_mballoc(1, i);
 497         m = mmbfree;
 498         if (m != NULL) {
 499                 mmbfree = m->m_next;
 500                 mbtypes[MT_FREE]--;
 501                 m->m_type = t;
 502                 mbtypes[t]++;
 503                 m->m_next = NULL;
 504                 m->m_nextpkt = NULL;
 505                 m->m_data = m->m_dat;
 506                 m->m_flags = 0;
 507                 splx(ms);
 508                 mbstat.m_wait++;
 509         } else {
 510                 static int last_report ; /* when we did that (in ticks) */
 511
 512                 splx(ms);
 513                 mbstat.m_drops++;
 514                 if (ticks < last_report || (ticks - last_report) >= hz) {
 515                         last_report = ticks;
 516                         printf("All mbufs exhausted, please see tuning(7).\n");
 517                 }
 518         }
 519
 520         return (m);
 521 }
 522
 523 /*
 524  * As above; retry an MGETHDR.
 525  */
 526 struct mbuf *
 527 m_retryhdr(int i, int t)
 528 {
 529         struct mbuf *m;
 530         int ms;
 531
 532         /*
 533          * Must only do the reclaim if not in an interrupt context.
 534          */
 535         if (i == MB_WAIT) {
 536                 KASSERT(mycpu->gd_intr_nesting_level == 0,
 537                     ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
 538                 m_reclaim();
 539         }
 540
 541         ms = splimp();
 542         if (mmbfree == NULL)
 543                 (void)m_mballoc(1, i);
 544         m = mmbfree;
 545         if (m != NULL) {
 546                 mmbfree = m->m_next;
 547                 mbtypes[MT_FREE]--;
 548                 m->m_type = t;
 549                 mbtypes[t]++;
 550                 m->m_next = NULL;
 551                 m->m_nextpkt = NULL;
 552                 m->m_data = m->m_pktdat;
 553                 m->m_flags = M_PKTHDR;
 554                 m->m_pkthdr.rcvif = NULL;
 555                 SLIST_INIT(&m->m_pkthdr.tags);
 556                 m->m_pkthdr.csum_flags = 0;
 557                 splx(ms);
 558                 mbstat.m_wait++;
 559         } else {
 560                 static int last_report ; /* when we did that (in ticks) */
 561
 562                 splx(ms);
 563                 mbstat.m_drops++;
 564                 if (ticks < last_report || (ticks - last_report) >= hz) {
 565                         last_report = ticks;
 566                         printf("All mbufs exhausted, please see tuning(7).\n");
 567                 }
 568         }
 569
 570         return (m);
 571 }
 572
 573 static void
 574 m_reclaim(void)
 575 {
 576         struct domain *dp;
 577         struct protosw *pr;
 578         int s = splimp();
 579
 580         for (dp = domains; dp; dp = dp->dom_next) {
 581                 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
 582                         if (pr->pr_drain)
 583                                 (*pr->pr_drain)();
 584                 }
 585         }
 586         splx(s);
 587         mbstat.m_drain++;
 588 }
 589
 590 /*
 591  * Space allocation routines.
 592  * These are also available as macros
 593  * for critical paths.
 594  */
 595 struct mbuf *
 596 m_get(int how, int type)
 597 {
 598         struct mbuf *m;
 599         int ms;
 600
 601         ms = splimp();
 602         if (mmbfree == NULL)
 603                 (void)m_mballoc(1, how);
 604         m = mmbfree;
 605         if (m != NULL) {
 606                 mmbfree = m->m_next;
 607                 mbtypes[MT_FREE]--;
 608                 m->m_type = type;
 609                 mbtypes[type]++;
 610                 m->m_next = NULL;
 611                 m->m_nextpkt = NULL;
 612                 m->m_data = m->m_dat;
 613                 m->m_flags = 0;
 614                 splx(ms);
 615         } else {
 616                 splx(ms);
 617                 m = m_retry(how, type);
 618                 if (m == NULL && how == MB_WAIT)
 619                         m = m_mballoc_wait(MGET_C, type);
 620         }
 621         return (m);
 622 }
 623
 624 struct mbuf *
 625 m_gethdr(int how, int type)
 626 {
 627         struct mbuf *m;
 628         int ms;
 629
 630         ms = splimp();
 631         if (mmbfree == NULL)
 632                 (void)m_mballoc(1, how);
 633         m = mmbfree;
 634         if (m != NULL) {
 635                 mmbfree = m->m_next;
 636                 mbtypes[MT_FREE]--;
 637                 m->m_type = type;
 638                 mbtypes[type]++;
 639                 m->m_next = NULL;
 640                 m->m_nextpkt = NULL;
 641                 m->m_data = m->m_pktdat;
 642                 m->m_flags = M_PKTHDR;
 643                 m->m_pkthdr.rcvif = NULL;
 644                 SLIST_INIT(&m->m_pkthdr.tags);
 645                 m->m_pkthdr.csum_flags = 0;
 646                 splx(ms);
 647         } else {
 648                 splx(ms);
 649                 m = m_retryhdr(how, type);
 650                 if (m == NULL && how == MB_WAIT)
 651                         m = m_mballoc_wait(MGETHDR_C, type);
 652         }
 653         return (m);
 654 }
 655
 656 struct mbuf *
 657 m_getclr(int how, int type)
 658 {
 659         struct mbuf *m;
 660
 661         MGET(m, how, type);
 662         if (m == 0)
 663                 return (0);
 664         bzero(mtod(m, caddr_t), MLEN);
 665         return (m);
 666 }
 667
 668 /*
 669  * m_getcl() returns an mbuf with an attached cluster.
 670  * Because many network drivers use this kind of buffers a lot, it is
 671  * convenient to keep a small pool of free buffers of this kind.
 672  * Even a small size such as 10 gives about 10% improvement in the
 673  * forwarding rate in a bridge or router.
 674  * The size of this free list is controlled by the sysctl variable
 675  * mcl_pool_max. The list is populated on m_freem(), and used in
 676  * m_getcl() if elements are available.
 677  */
 678 static struct mbuf *mcl_pool;
 679 static int mcl_pool_now;
 680 static int mcl_pool_max = 10;
 681
 682 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_max, CTLFLAG_RW, &mcl_pool_max, 0,
 683            "Maximum number of mbufs+cluster in free list");
 684 SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_pool_now, CTLFLAG_RD, &mcl_pool_now, 0,
 685            "Current number of mbufs+cluster in free list");
 686
 687 struct mbuf *
 688 m_getcl(int how, short type, int flags)
 689 {
 690         int s = splimp();
 691         struct mbuf *mp;
 692
 693         if (flags & M_PKTHDR) {
 694                 if (type == MT_DATA && mcl_pool) {
 695                         mp = mcl_pool;
 696                         mcl_pool = mp->m_nextpkt;
 697                         mcl_pool_now--;
 698                         splx(s);
 699                         mp->m_nextpkt = NULL;
 700                         mp->m_data = mp->m_ext.ext_buf;
 701                         mp->m_flags = M_PKTHDR|M_EXT;
 702                         mp->m_pkthdr.rcvif = NULL;
 703                         mp->m_pkthdr.csum_flags = 0;
 704                         return mp;
 705                 } else
 706                         MGETHDR(mp, how, type);
 707         } else
 708                 MGET(mp, how, type);
 709         if (mp) {
 710                 MCLGET(mp, how);
 711                 if ( (mp->m_flags & M_EXT) == 0) {
 712                         m_free(mp);
 713                         mp = NULL;
 714                 }
 715         }
 716         splx(s);
 717         return mp;
 718 }
 719
 720 /*
 721  * struct mbuf *
 722  * m_getm(m, len, how, type)
 723  *
 724  * This will allocate len-worth of mbufs and/or mbuf clusters (whatever fits
 725  * best) and return a pointer to the top of the allocated chain. If m is
 726  * non-null, then we assume that it is a single mbuf or an mbuf chain to
 727  * which we want len bytes worth of mbufs and/or clusters attached, and so
 728  * if we succeed in allocating it, we will just return a pointer to m.
 729  *
 730  * If we happen to fail at any point during the allocation, we will free
 731  * up everything we have already allocated and return NULL.
 732  *
 733  */
 734 struct mbuf *
 735 m_getm(struct mbuf *m, int len, int how, int type)
 736 {
 737         struct mbuf *top, *tail, *mp, *mtail = NULL;
 738
 739         KASSERT(len >= 0, ("len is < 0 in m_getm"));
 740
 741         MGET(mp, how, type);
 742         if (mp == NULL)
 743                 return (NULL);
 744         else if (len > MINCLSIZE) {
 745                 MCLGET(mp, how);
 746                 if ((mp->m_flags & M_EXT) == 0) {
 747                         m_free(mp);
 748                         return (NULL);
 749                 }
 750         }
 751         mp->m_len = 0;
 752         len -= M_TRAILINGSPACE(mp);
 753
 754         if (m != NULL)
 755                 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
 756         else
 757                 m = mp;
 758
 759         top = tail = mp;
 760         while (len > 0) {
 761                 MGET(mp, how, type);
 762                 if (mp == NULL)
 763                         goto failed;
 764
 765                 tail->m_next = mp;
 766                 tail = mp;
 767                 if (len > MINCLSIZE) {
 768                         MCLGET(mp, how);
 769                         if ((mp->m_flags & M_EXT) == 0)
 770                                 goto failed;
 771                 }
 772
 773                 mp->m_len = 0;
 774                 len -= M_TRAILINGSPACE(mp);
 775         }
 776
 777         if (mtail != NULL)
 778                 mtail->m_next = top;
 779         return (m);
 780
 781 failed:
 782         m_freem(top);
 783         return (NULL);
 784 }
 785
 786 /*
 787  * m_mclalloc() - Allocates an mbuf cluster.
 788  */
 789 caddr_t
 790 m_mclalloc(int how)
 791 {
 792         caddr_t mp;
 793         int s;
 794
 795         s = splimp();
 796
 797         if (mclfree == NULL)
 798                 m_clalloc(1, how);
 799         mp = (caddr_t)mclfree;
 800         if (mp != NULL) {
 801                 KKASSERT((struct mbuf *)mp >= mbutl &&
 802                          (struct mbuf *)mp < mbute);
 803                 mclrefcnt[mtocl(mp)]++;
 804                 mbstat.m_clfree--;
 805                 mclfree = ((union mcluster *)mp)->mcl_next;
 806                 splx(s);
 807                 return(mp);
 808         }
 809         splx(s);
 810         if (how == MB_WAIT)
 811                 return(m_clalloc_wait());
 812         return(NULL);
 813 }
 814
 815 /*
 816  *  m_mclget() - Adds a cluster to a normal mbuf, M_EXT is set on success.
 817  */
 818 void
 819 m_mclget(struct mbuf *m, int how)
 820 {
 821         m->m_ext.ext_buf = m_mclalloc(how);
 822         if (m->m_ext.ext_buf != NULL) {
 823                 m->m_data = m->m_ext.ext_buf;
 824                 m->m_flags |= M_EXT;
 825                 m->m_ext.ext_free = NULL;
 826                 m->m_ext.ext_ref = NULL;
 827                 m->m_ext.ext_size = MCLBYTES;
 828         }
 829 }
 830
 831 static __inline void
 832 _m_mclfree(caddr_t data)
 833 {
 834         union mcluster *mp = (union mcluster *)data;
 835
 836         KASSERT(mclrefcnt[mtocl(mp)] > 0, ("freeing free cluster"));
 837         KKASSERT((struct mbuf *)mp >= mbutl &&
 838                  (struct mbuf *)mp < mbute);
 839         if (--mclrefcnt[mtocl(mp)] == 0) {
 840                 mp->mcl_next = mclfree;
 841                 mclfree = mp;
 842                 mbstat.m_clfree++;
 843                 MCLWAKEUP();
 844         }
 845 }
 846
 847 void
 848 m_mclfree(caddr_t mp)
 849 {
 850         int s = splimp();
 851         _m_mclfree(mp);
 852         splx(s);
 853 }
 854
 855 /*
 856  * m_free()
 857  *
 858  * Free a single mbuf and any associated external storage.  The successor,
 859  * if any, is returned.
 860  *
 861  * We do need to check non-first mbuf for m_aux, since some of existing
 862  * code does not call M_PREPEND properly.
 863  * (example: call to bpf_mtap from drivers)
 864  */
 865 struct mbuf *
 866 m_free(struct mbuf *m)
 867 {
 868         int s;
 869         struct mbuf *n;
 870
 871         s = splimp();
 872         KASSERT(m->m_type != MT_FREE, ("freeing free mbuf"));
 873         mbtypes[m->m_type]--;
 874         if ((m->m_flags & M_PKTHDR) != 0)
 875                 m_tag_delete_chain(m, NULL);
 876         if (m->m_flags & M_EXT) {
 877                 if (m->m_ext.ext_free != NULL) {
 878                         m->m_ext.ext_free(m->m_ext.ext_buf, m->m_ext.ext_size);
 879                 } else {
 880                         _m_mclfree(m->m_ext.ext_buf); /* inlined */
 881                 }
 882         }
 883         n = m->m_next;
 884         m->m_type = MT_FREE;
 885         mbtypes[MT_FREE]++;
 886         m->m_next = mmbfree;
 887         mmbfree = m;
 888         MMBWAKEUP();
 889         splx(s);
 890
 891         return (n);
 892 }
 893
 894 void
 895 m_freem(struct mbuf *m)
 896 {
 897         int s = splimp();
 898
 899         /*
 900          * Try to keep a small pool of mbuf+cluster for quick use in
 901          * device drivers. A good candidate is a M_PKTHDR buffer with
 902          * only one cluster attached. Other mbufs, or those exceeding
 903          * the pool size, are just m_free'd in the usual way.
 904          * The following code makes sure that m_next, m_type,
 905          * m_pkthdr.aux and m_ext.* are properly initialized.
 906          * Other fields in the mbuf are initialized in m_getcl()
 907          * upon allocation.
 908          */
 909         if (mcl_pool_now < mcl_pool_max && m && m->m_next == NULL &&
 910             (m->m_flags & (M_PKTHDR|M_EXT)) == (M_PKTHDR|M_EXT) &&
 911             m->m_type == MT_DATA && M_EXT_WRITABLE(m) ) {
 912                 m_tag_delete_chain(m, NULL);
 913                 m->m_nextpkt = mcl_pool;
 914                 mcl_pool = m;
 915                 mcl_pool_now++;
 916         } else {
 917                 while (m)
 918                         m = m_free(m);
 919         }
 920         splx(s);
 921 }
 922
 923 /*
 924  * Mbuffer utility routines.
 925  */
 926
 927 /*
 928  * Lesser-used path for M_PREPEND:
 929  * allocate new mbuf to prepend to chain,
 930  * copy junk along.
 931  */
 932 struct mbuf *
 933 m_prepend(struct mbuf *m, int len, int how)
 934 {
 935         struct mbuf *mn;
 936
 937         MGET(mn, how, m->m_type);
 938         if (mn == (struct mbuf *)NULL) {
 939                 m_freem(m);
 940                 return ((struct mbuf *)NULL);
 941         }
 942         if (m->m_flags & M_PKTHDR)
 943                 M_MOVE_PKTHDR(mn, m);
 944         mn->m_next = m;
 945         m = mn;
 946         if (len < MHLEN)
 947                 MH_ALIGN(m, len);
 948         m->m_len = len;
 949         return (m);
 950 }
 951
 952 /*
 953  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 954  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 955  * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
 956  * Note that the copy is read-only, because clusters are not copied,
 957  * only their reference counts are incremented.
 958  */
 959 #define MCFail (mbstat.m_mcfail)
 960
 961 struct mbuf *
 962 m_copym(const struct mbuf *m, int off0, int len, int wait)
 963 {
 964         struct mbuf *n, **np;
 965         int off = off0;
 966         struct mbuf *top;
 967         int copyhdr = 0;
 968
 969         KASSERT(off >= 0, ("m_copym, negative off %d", off));
 970         KASSERT(len >= 0, ("m_copym, negative len %d", len));
 971         if (off == 0 && m->m_flags & M_PKTHDR)
 972                 copyhdr = 1;
 973         while (off > 0) {
 974                 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
 975                 if (off < m->m_len)
 976                         break;
 977                 off -= m->m_len;
 978                 m = m->m_next;
 979         }
 980         np = &top;
 981         top = 0;
 982         while (len > 0) {
 983                 if (m == 0) {
 984                         KASSERT(len == M_COPYALL,
 985                             ("m_copym, length > size of mbuf chain"));
 986                         break;
 987                 }
 988                 MGET(n, wait, m->m_type);
 989                 *np = n;
 990                 if (n == 0)
 991                         goto nospace;
 992                 if (copyhdr) {
 993                         if (!m_dup_pkthdr(n, m, wait))
 994                                 goto nospace;
 995                         if (len == M_COPYALL)
 996                                 n->m_pkthdr.len -= off0;
 997                         else
 998                                 n->m_pkthdr.len = len;
 999                         copyhdr = 0;
1000                 }
1001                 n->m_len = min(len, m->m_len - off);
1002                 if (m->m_flags & M_EXT) {
1003                         n->m_data = m->m_data + off;
1004                         if (m->m_ext.ext_ref == NULL) {
1005                                 atomic_add_char(
1006                                     &mclrefcnt[mtocl(m->m_ext.ext_buf)], 1);
1007                         } else {
1008                                 int s = splimp();
1009
1010                                 (*m->m_ext.ext_ref)(m->m_ext.ext_buf,
1011                                     m->m_ext.ext_size);
1012                                 splx(s);
1013                         }
1014                         n->m_ext = m->m_ext;
1015                         n->m_flags |= M_EXT;
1016                 } else
1017                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1018                             (unsigned)n->m_len);
1019                 if (len != M_COPYALL)
1020                         len -= n->m_len;
1021                 off = 0;
1022                 m = m->m_next;
1023                 np = &n->m_next;
1024         }
1025         if (top == 0)
1026                 MCFail++;
1027         return (top);
1028 nospace:
1029         m_freem(top);
1030         MCFail++;
1031         return (0);
1032 }
1033
1034 /*
1035  * Copy an entire packet, including header (which must be present).
1036  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1037  * Note that the copy is read-only, because clusters are not copied,
1038  * only their reference counts are incremented.
1039  * Preserve alignment of the first mbuf so if the creator has left
1040  * some room at the beginning (e.g. for inserting protocol headers)
1041  * the copies also have the room available.
1042  */
1043 struct mbuf *
1044 m_copypacket(struct mbuf *m, int how)
1045 {
1046         struct mbuf *top, *n, *o;
1047
1048         MGET(n, how, m->m_type);
1049         top = n;
1050         if (!n)
1051                 goto nospace;
1052
1053         if (!m_dup_pkthdr(n, m, how))
1054                 goto nospace;
1055         n->m_len = m->m_len;
1056         if (m->m_flags & M_EXT) {
1057                 n->m_data = m->m_data;
1058                 if (m->m_ext.ext_ref == NULL)
1059                         atomic_add_char(&mclrefcnt[mtocl(m->m_ext.ext_buf)], 1);
1060                 else {
1061                         int s = splimp();
1062
1063                         (*m->m_ext.ext_ref)(m->m_ext.ext_buf,
1064                             m->m_ext.ext_size);
1065                         splx(s);
1066                 }
1067                 n->m_ext = m->m_ext;
1068                 n->m_flags |= M_EXT;
1069         } else {
1070                 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
1071                 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1072         }
1073
1074         m = m->m_next;
1075         while (m) {
1076                 MGET(o, how, m->m_type);
1077                 if (!o)
1078                         goto nospace;
1079
1080                 n->m_next = o;
1081                 n = n->m_next;
1082
1083                 n->m_len = m->m_len;
1084                 if (m->m_flags & M_EXT) {
1085                         n->m_data = m->m_data;
1086                         if (m->m_ext.ext_ref == NULL) {
1087                                 atomic_add_char(
1088                                     &mclrefcnt[mtocl(m->m_ext.ext_buf)], 1);
1089                         } else {
1090                                 int s = splimp();
1091
1092                                 (*m->m_ext.ext_ref)(m->m_ext.ext_buf,
1093                                     m->m_ext.ext_size);
1094                                 splx(s);
1095                         }
1096                         n->m_ext = m->m_ext;
1097                         n->m_flags |= M_EXT;
1098                 } else {
1099                         bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1100                 }
1101
1102                 m = m->m_next;
1103         }
1104         return top;
1105 nospace:
1106         m_freem(top);
1107         MCFail++;
1108         return 0;
1109 }
1110
1111 /*
1112  * Copy data from an mbuf chain starting "off" bytes from the beginning,
1113  * continuing for "len" bytes, into the indicated buffer.
1114  */
1115 void
1116 m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1117 {
1118         unsigned count;
1119
1120         KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1121         KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1122         while (off > 0) {
1123                 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1124                 if (off < m->m_len)
1125                         break;
1126                 off -= m->m_len;
1127                 m = m->m_next;
1128         }
1129         while (len > 0) {
1130                 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1131                 count = min(m->m_len - off, len);
1132                 bcopy(mtod(m, caddr_t) + off, cp, count);
1133                 len -= count;
1134                 cp += count;
1135                 off = 0;
1136                 m = m->m_next;
1137         }
1138 }
1139
1140 /*
1141  * Copy a packet header mbuf chain into a completely new chain, including
1142  * copying any mbuf clusters.  Use this instead of m_copypacket() when
1143  * you need a writable copy of an mbuf chain.
1144  */
1145 struct mbuf *
1146 m_dup(struct mbuf *m, int how)
1147 {
1148         struct mbuf **p, *top = NULL;
1149         int remain, moff, nsize;
1150
1151         /* Sanity check */
1152         if (m == NULL)
1153                 return (0);
1154         KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __FUNCTION__));
1155
1156         /* While there's more data, get a new mbuf, tack it on, and fill it */
1157         remain = m->m_pkthdr.len;
1158         moff = 0;
1159         p = &top;
1160         while (remain > 0 || top == NULL) {     /* allow m->m_pkthdr.len == 0 */
1161                 struct mbuf *n;
1162
1163                 /* Get the next new mbuf */
1164                 MGET(n, how, m->m_type);
1165                 if (n == NULL)
1166                         goto nospace;
1167                 if (top == NULL) {              /* first one, must be PKTHDR */
1168                         if (!m_dup_pkthdr(n, m, how))
1169                                 goto nospace;
1170                         nsize = MHLEN;
1171                 } else                          /* not the first one */
1172                         nsize = MLEN;
1173                 if (remain >= MINCLSIZE) {
1174                         MCLGET(n, how);
1175                         if ((n->m_flags & M_EXT) == 0) {
1176                                 (void)m_free(n);
1177                                 goto nospace;
1178                         }
1179                         nsize = MCLBYTES;
1180                 }
1181                 n->m_len = 0;
1182
1183                 /* Link it into the new chain */
1184                 *p = n;
1185                 p = &n->m_next;
1186
1187                 /* Copy data from original mbuf(s) into new mbuf */
1188                 while (n->m_len < nsize && m != NULL) {
1189                         int chunk = min(nsize - n->m_len, m->m_len - moff);
1190
1191                         bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1192                         moff += chunk;
1193                         n->m_len += chunk;
1194                         remain -= chunk;
1195                         if (moff == m->m_len) {
1196                                 m = m->m_next;
1197                                 moff = 0;
1198                         }
1199                 }
1200
1201                 /* Check correct total mbuf length */
1202                 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1203                         ("%s: bogus m_pkthdr.len", __FUNCTION__));
1204         }
1205         return (top);
1206
1207 nospace:
1208         m_freem(top);
1209         MCFail++;
1210         return (0);
1211 }
1212
1213 /*
1214  * Concatenate mbuf chain n to m.
1215  * Both chains must be of the same type (e.g. MT_DATA).
1216  * Any m_pkthdr is not updated.
1217  */
1218 void
1219 m_cat(struct mbuf *m, struct mbuf *n)
1220 {
1221         while (m->m_next)
1222                 m = m->m_next;
1223         while (n) {
1224                 if (m->m_flags & M_EXT ||
1225                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1226                         /* just join the two chains */
1227                         m->m_next = n;
1228                         return;
1229                 }
1230                 /* splat the data from one into the other */
1231                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1232                     (u_int)n->m_len);
1233                 m->m_len += n->m_len;
1234                 n = m_free(n);
1235         }
1236 }
1237
1238 void
1239 m_adj(struct mbuf *mp, int req_len)
1240 {
1241         int len = req_len;
1242         struct mbuf *m;
1243         int count;
1244
1245         if ((m = mp) == NULL)
1246                 return;
1247         if (len >= 0) {
1248                 /*
1249                  * Trim from head.
1250                  */
1251                 while (m != NULL && len > 0) {
1252                         if (m->m_len <= len) {
1253                                 len -= m->m_len;
1254                                 m->m_len = 0;
1255                                 m = m->m_next;
1256                         } else {
1257                                 m->m_len -= len;
1258                                 m->m_data += len;
1259                                 len = 0;
1260                         }
1261                 }
1262                 m = mp;
1263                 if (mp->m_flags & M_PKTHDR)
1264                         m->m_pkthdr.len -= (req_len - len);
1265         } else {
1266                 /*
1267                  * Trim from tail.  Scan the mbuf chain,
1268                  * calculating its length and finding the last mbuf.
1269                  * If the adjustment only affects this mbuf, then just
1270                  * adjust and return.  Otherwise, rescan and truncate
1271                  * after the remaining size.
1272                  */
1273                 len = -len;
1274                 count = 0;
1275                 for (;;) {
1276                         count += m->m_len;
1277                         if (m->m_next == (struct mbuf *)0)
1278                                 break;
1279                         m = m->m_next;
1280                 }
1281                 if (m->m_len >= len) {
1282                         m->m_len -= len;
1283                         if (mp->m_flags & M_PKTHDR)
1284                                 mp->m_pkthdr.len -= len;
1285                         return;
1286                 }
1287                 count -= len;
1288                 if (count < 0)
1289                         count = 0;
1290                 /*
1291                  * Correct length for chain is "count".
1292                  * Find the mbuf with last data, adjust its length,
1293                  * and toss data from remaining mbufs on chain.
1294                  */
1295                 m = mp;
1296                 if (m->m_flags & M_PKTHDR)
1297                         m->m_pkthdr.len = count;
1298                 for (; m; m = m->m_next) {
1299                         if (m->m_len >= count) {
1300                                 m->m_len = count;
1301                                 break;
1302                         }
1303                         count -= m->m_len;
1304                 }
1305                 while (m->m_next)
1306                         (m = m->m_next) ->m_len = 0;
1307         }
1308 }
1309
1310 /*
1311  * Rearange an mbuf chain so that len bytes are contiguous
1312  * and in the data area of an mbuf (so that mtod will work for a structure
1313  * of size len).  Returns the resulting mbuf chain on success, frees it and
1314  * returns null on failure.  If there is room, it will add up to
1315  * max_protohdr-len extra bytes to the contiguous region in an attempt to
1316  * avoid being called next time.
1317  */
1318 #define MPFail (mbstat.m_mpfail)
1319
1320 struct mbuf *
1321 m_pullup(struct mbuf *n, int len)
1322 {
1323         struct mbuf *m;
1324         int count;
1325         int space;
1326
1327         /*
1328          * If first mbuf has no cluster, and has room for len bytes
1329          * without shifting current data, pullup into it,
1330          * otherwise allocate a new mbuf to prepend to the chain.
1331          */
1332         if ((n->m_flags & M_EXT) == 0 &&
1333             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
1334                 if (n->m_len >= len)
1335                         return (n);
1336                 m = n;
1337                 n = n->m_next;
1338                 len -= m->m_len;
1339         } else {
1340                 if (len > MHLEN)
1341                         goto bad;
1342                 MGET(m, MB_DONTWAIT, n->m_type);
1343                 if (m == 0)
1344                         goto bad;
1345                 m->m_len = 0;
1346                 if (n->m_flags & M_PKTHDR)
1347                         M_MOVE_PKTHDR(m, n);
1348         }
1349         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1350         do {
1351                 count = min(min(max(len, max_protohdr), space), n->m_len);
1352                 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1353                   (unsigned)count);
1354                 len -= count;
1355                 m->m_len += count;
1356                 n->m_len -= count;
1357                 space -= count;
1358                 if (n->m_len)
1359                         n->m_data += count;
1360                 else
1361                         n = m_free(n);
1362         } while (len > 0 && n);
1363         if (len > 0) {
1364                 (void) m_free(m);
1365                 goto bad;
1366         }
1367         m->m_next = n;
1368         return (m);
1369 bad:
1370         m_freem(n);
1371         MPFail++;
1372         return (0);
1373 }
1374
1375 /*
1376  * Partition an mbuf chain in two pieces, returning the tail --
1377  * all but the first len0 bytes.  In case of failure, it returns NULL and
1378  * attempts to restore the chain to its original state.
1379  *
1380  * Note that the resulting mbufs might be read-only, because the new
1381  * mbuf can end up sharing an mbuf cluster with the original mbuf if
1382  * the "breaking point" happens to lie within a cluster mbuf. Use the
1383  * M_WRITABLE() macro to check for this case.
1384  */
1385 struct mbuf *
1386 m_split(struct mbuf *m0, int len0, int wait)
1387 {
1388         struct mbuf *m, *n;
1389         unsigned len = len0, remain;
1390
1391         for (m = m0; m && len > m->m_len; m = m->m_next)
1392                 len -= m->m_len;
1393         if (m == 0)
1394                 return (0);
1395         remain = m->m_len - len;
1396         if (m0->m_flags & M_PKTHDR) {
1397                 MGETHDR(n, wait, m0->m_type);
1398                 if (n == 0)
1399                         return (0);
1400                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1401                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1402                 m0->m_pkthdr.len = len0;
1403                 if (m->m_flags & M_EXT)
1404                         goto extpacket;
1405                 if (remain > MHLEN) {
1406                         /* m can't be the lead packet */
1407                         MH_ALIGN(n, 0);
1408                         n->m_next = m_split(m, len, wait);
1409                         if (n->m_next == 0) {
1410                                 (void) m_free(n);
1411                                 return (0);
1412                         } else {
1413                                 n->m_len = 0;
1414                                 return (n);
1415                         }
1416                 } else
1417                         MH_ALIGN(n, remain);
1418         } else if (remain == 0) {
1419                 n = m->m_next;
1420                 m->m_next = 0;
1421                 return (n);
1422         } else {
1423                 MGET(n, wait, m->m_type);
1424                 if (n == 0)
1425                         return (0);
1426                 M_ALIGN(n, remain);
1427         }
1428 extpacket:
1429         if (m->m_flags & M_EXT) {
1430                 n->m_flags |= M_EXT;
1431                 n->m_ext = m->m_ext;
1432                 if (m->m_ext.ext_ref == NULL)
1433                         atomic_add_char(&mclrefcnt[mtocl(m->m_ext.ext_buf)], 1);
1434                 else {
1435                         int s = splimp();
1436
1437                         (*m->m_ext.ext_ref)(m->m_ext.ext_buf,
1438                             m->m_ext.ext_size);
1439                         splx(s);
1440                 }
1441                 n->m_data = m->m_data + len;
1442         } else {
1443                 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1444         }
1445         n->m_len = remain;
1446         m->m_len = len;
1447         n->m_next = m->m_next;
1448         m->m_next = 0;
1449         return (n);
1450 }
1451 /*
1452  * Routine to copy from device local memory into mbufs.
1453  */
1454 struct mbuf *
1455 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
1456         void (*copy) (char *from, caddr_t to, u_int len))
1457 {
1458         struct mbuf *m;
1459         struct mbuf *top = 0, **mp = &top;
1460         int off = off0, len;
1461         char *cp;
1462         char *epkt;
1463
1464         cp = buf;
1465         epkt = cp + totlen;
1466         if (off) {
1467                 cp += off + 2 * sizeof(u_short);
1468                 totlen -= 2 * sizeof(u_short);
1469         }
1470         MGETHDR(m, MB_DONTWAIT, MT_DATA);
1471         if (m == 0)
1472                 return (0);
1473         m->m_pkthdr.rcvif = ifp;
1474         m->m_pkthdr.len = totlen;
1475         m->m_len = MHLEN;
1476
1477         while (totlen > 0) {
1478                 if (top) {
1479                         MGET(m, MB_DONTWAIT, MT_DATA);
1480                         if (m == 0) {
1481                                 m_freem(top);
1482                                 return (0);
1483                         }
1484                         m->m_len = MLEN;
1485                 }
1486                 len = min(totlen, epkt - cp);
1487                 if (len >= MINCLSIZE) {
1488                         MCLGET(m, MB_DONTWAIT);
1489                         if (m->m_flags & M_EXT)
1490                                 m->m_len = len = min(len, MCLBYTES);
1491                         else
1492                                 len = m->m_len;
1493                 } else {
1494                         /*
1495                          * Place initial small packet/header at end of mbuf.
1496                          */
1497                         if (len < m->m_len) {
1498                                 if (top == 0 && len + max_linkhdr <= m->m_len)
1499                                         m->m_data += max_linkhdr;
1500                                 m->m_len = len;
1501                         } else
1502                                 len = m->m_len;
1503                 }
1504                 if (copy)
1505                         copy(cp, mtod(m, caddr_t), (unsigned)len);
1506                 else
1507                         bcopy(cp, mtod(m, caddr_t), (unsigned)len);
1508                 cp += len;
1509                 *mp = m;
1510                 mp = &m->m_next;
1511                 totlen -= len;
1512                 if (cp == epkt)
1513                         cp = buf;
1514         }
1515         return (top);
1516 }
1517
1518 /*
1519  * Copy data from a buffer back into the indicated mbuf chain,
1520  * starting "off" bytes from the beginning, extending the mbuf
1521  * chain if necessary.
1522  */
1523 void
1524 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1525 {
1526         int mlen;
1527         struct mbuf *m = m0, *n;
1528         int totlen = 0;
1529
1530         if (m0 == 0)
1531                 return;
1532         while (off > (mlen = m->m_len)) {
1533                 off -= mlen;
1534                 totlen += mlen;
1535                 if (m->m_next == 0) {
1536                         n = m_getclr(MB_DONTWAIT, m->m_type);
1537                         if (n == 0)
1538                                 goto out;
1539                         n->m_len = min(MLEN, len + off);
1540                         m->m_next = n;
1541                 }
1542                 m = m->m_next;
1543         }
1544         while (len > 0) {
1545                 mlen = min (m->m_len - off, len);
1546                 bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1547                 cp += mlen;
1548                 len -= mlen;
1549                 mlen += off;
1550                 off = 0;
1551                 totlen += mlen;
1552                 if (len == 0)
1553                         break;
1554                 if (m->m_next == 0) {
1555                         n = m_get(MB_DONTWAIT, m->m_type);
1556                         if (n == 0)
1557                                 break;
1558                         n->m_len = min(MLEN, len);
1559                         m->m_next = n;
1560                 }
1561                 m = m->m_next;
1562         }
1563 out:    if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1564                 m->m_pkthdr.len = totlen;
1565 }
1566
1567 void
1568 m_print(const struct mbuf *m)
1569 {
1570         int len;
1571         const struct mbuf *m2;
1572
1573         len = m->m_pkthdr.len;
1574         m2 = m;
1575         while (len) {
1576                 printf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1577                 len -= m2->m_len;
1578                 m2 = m2->m_next;
1579         }
1580         return;
1581 }
1582
1583 /*
1584  * "Move" mbuf pkthdr from "from" to "to".
1585  * "from" must have M_PKTHDR set, and "to" must be empty.
1586  */
1587 void
1588 m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1589 {
1590         KASSERT((to->m_flags & M_EXT) == 0, ("m_move_pkthdr: to has cluster"));
1591
1592         to->m_flags = from->m_flags & M_COPYFLAGS;
1593         to->m_data = to->m_pktdat;
1594         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
1595         SLIST_INIT(&from->m_pkthdr.tags);       /* purge tags from src */
1596         from->m_flags &= ~M_PKTHDR;
1597 }
1598
1599 /*
1600  * Duplicate "from"'s mbuf pkthdr in "to".
1601  * "from" must have M_PKTHDR set, and "to" must be empty.
1602  * In particular, this does a deep copy of the packet tags.
1603  */
1604 int
1605 m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1606 {
1607         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
1608         if ((to->m_flags & M_EXT) == 0)
1609                 to->m_data = to->m_pktdat;
1610         to->m_pkthdr = from->m_pkthdr;
1611         SLIST_INIT(&to->m_pkthdr.tags);
1612         return (m_tag_copy_chain(to, from, how));
1613 }
1614
1615 /*
1616  * Defragment a mbuf chain, returning the shortest possible
1617  * chain of mbufs and clusters.  If allocation fails and
1618  * this cannot be completed, NULL will be returned, but
1619  * the passed in chain will be unchanged.  Upon success,
1620  * the original chain will be freed, and the new chain
1621  * will be returned.
1622  *
1623  * If a non-packet header is passed in, the original
1624  * mbuf (chain?) will be returned unharmed.
1625  */
1626 struct mbuf *
1627 m_defrag(struct mbuf *m0, int how)
1628 {
1629         struct mbuf     *m_new = NULL, *m_final = NULL;
1630         int             progress = 0, length;
1631
1632         if (!(m0->m_flags & M_PKTHDR))
1633                 return (m0);
1634
1635 #ifdef MBUF_STRESS_TEST
1636         if (m_defragrandomfailures) {
1637                 int temp = arc4random() & 0xff;
1638                 if (temp == 0xba)
1639                         goto nospace;
1640         }
1641 #endif
1642
1643         if (m0->m_pkthdr.len > MHLEN)
1644                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
1645         else
1646                 m_final = m_gethdr(how, MT_DATA);
1647
1648         if (m_final == NULL)
1649                 goto nospace;
1650
1651         if (m_dup_pkthdr(m_final, m0, how) == NULL)
1652                 goto nospace;
1653
1654         m_new = m_final;
1655
1656         while (progress < m0->m_pkthdr.len) {
1657                 length = m0->m_pkthdr.len - progress;
1658                 if (length > MCLBYTES)
1659                         length = MCLBYTES;
1660
1661                 if (m_new == NULL) {
1662                         if (length > MLEN)
1663                                 m_new = m_getcl(how, MT_DATA, 0);
1664                         else
1665                                 m_new = m_get(how, MT_DATA);
1666                         if (m_new == NULL)
1667                                 goto nospace;
1668                 }
1669
1670                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1671                 progress += length;
1672                 m_new->m_len = length;
1673                 if (m_new != m_final)
1674                         m_cat(m_final, m_new);
1675                 m_new = NULL;
1676         }
1677         if (m0->m_next == NULL)
1678                 m_defraguseless++;
1679         m_freem(m0);
1680         m0 = m_final;
1681         m_defragpackets++;
1682         m_defragbytes += m0->m_pkthdr.len;
1683         return (m0);
1684 nospace:
1685         m_defragfailure++;
1686         if (m_new)
1687                 m_free(m_new);
1688         if (m_final)
1689                 m_freem(m_final);
1690         return (NULL);
1691 }
1692
1693 /*
1694  * Move data from uio into mbufs.
1695  * A length of zero means copy the whole uio.
1696  */
1697 struct mbuf *
1698 m_uiomove(struct uio *uio, int wait, int len0)
1699 {
1700         struct mbuf *head;              /* result mbuf chain */
1701         struct mbuf *m;                 /* current working mbuf */
1702         struct mbuf **mp;
1703         int resid, datalen, error;
1704
1705         resid = (len0 == 0) ? uio->uio_resid : min(len0, uio->uio_resid);
1706
1707         head = NULL;
1708         mp = &head;
1709         do {
1710                 if (resid > MHLEN) {
1711                         m = m_getcl(wait, MT_DATA, head == NULL ? M_PKTHDR : 0);
1712                         if (m == NULL)
1713                                 goto failed;
1714                         if (m->m_flags & M_PKTHDR)
1715                                 m->m_pkthdr.len = 0;
1716                 } else {
1717                         if (head == NULL) {
1718                                 MGETHDR(m, wait, MT_DATA);
1719                                 if (m == NULL)
1720                                         goto failed;
1721                                 m->m_pkthdr.len = 0;
1722                                 /* Leave room for protocol headers. */
1723                                 if (resid < MHLEN)
1724                                         MH_ALIGN(m, resid);
1725                         } else {
1726                                 MGET(m, wait, MT_DATA);
1727                                 if (m == NULL)
1728                                         goto failed;
1729                         }
1730                 }
1731                 datalen = min(MCLBYTES, resid);
1732                 error = uiomove(mtod(m, caddr_t), datalen, uio);
1733                 if (error) {
1734                         m_free(m);
1735                         goto failed;
1736                 }
1737                 m->m_len = datalen;
1738                 *mp = m;
1739                 mp = &m->m_next;
1740                 head->m_pkthdr.len += datalen;
1741                 resid -= datalen;
1742         } while (resid > 0);
1743
1744         return (head);
1745
1746 failed:
1747         if (head)
1748                 m_freem(head);
1749         return (NULL);
1750 }