mbuf: Don't allow mbuf thresholds to be configured too low.
[dragonfly.git] / sys / kern / uipc_mbuf.c
984263bc 1/*
5bd48c1d
MD
2 * (MPSAFE)
3 *
0c33f36d 4 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved.
66d6c637
JH
5 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
6 *
7 * This code is derived from software contributed to The DragonFly Project
8 * by Jeffrey M. Hsu.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
66d6c637 36/*
984263bc
MD
37 * Copyright (c) 1982, 1986, 1988, 1991, 1993
38 * The Regents of the University of California. All rights reserved.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
dc71b7ab 48 * 3. Neither the name of the University nor the names of its contributors
984263bc
MD
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
8a3125c6 64 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
984263bc
MD
65 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
66 */
67
68#include "opt_param.h"
69#include "opt_mbuf_stress_test.h"
70#include <sys/param.h>
71#include <sys/systm.h>
4e23f366 72#include <sys/file.h>
984263bc
MD
73#include <sys/malloc.h>
74#include <sys/mbuf.h>
75#include <sys/kernel.h>
76#include <sys/sysctl.h>
77#include <sys/domain.h>
7b6f875f 78#include <sys/objcache.h>
e9fa4b60 79#include <sys/tree.h>
984263bc 80#include <sys/protosw.h>
0c33f36d 81#include <sys/uio.h>
ef0fdad1 82#include <sys/thread.h>
a2a5ad0d 83#include <sys/globaldata.h>
5bd48c1d 84
90775e29 85#include <sys/thread2.h>
5bd48c1d 86#include <sys/spinlock2.h>
984263bc 87
1d16b2b5 88#include <machine/atomic.h>
e54488bb 89#include <machine/limits.h>
1d16b2b5 90
984263bc
MD
91#include <vm/vm.h>
92#include <vm/vm_kern.h>
93#include <vm/vm_extern.h>
94
95#ifdef INVARIANTS
96#include <machine/cpu.h>
97#endif
98
90775e29
MD
99/*
100 * mbuf cluster meta-data
101 */
7b6f875f 102struct mbcluster {
90775e29
MD
103 int32_t mcl_refs;
104 void *mcl_data;
7b6f875f 105};
90775e29 106
e9fa4b60
MD
107/*
108 * mbuf tracking for debugging purposes
109 */
110#ifdef MBUF_DEBUG
111
112static MALLOC_DEFINE(M_MTRACK, "mtrack", "mtrack");
113
 114struct mbtrack;
115RB_HEAD(mbuf_rb_tree, mbtrack);
116RB_PROTOTYPE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *);
117
118struct mbtrack {
119 RB_ENTRY(mbtrack) rb_node;
120 int trackid;
121 struct mbuf *m;
122};
123
124static int
125mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2)
126{
127 if (mb1->m < mb2->m)
128 return(-1);
129 if (mb1->m > mb2->m)
130 return(1);
131 return(0);
132}
133
134RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m);
135
136struct mbuf_rb_tree mbuf_track_root;
ba87a4ab 137static struct spinlock mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin, "mbuf_track_spin");
e9fa4b60
MD
138
139static void
140mbuftrack(struct mbuf *m)
141{
142 struct mbtrack *mbt;
143
e9fa4b60 144 mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO);
5bd48c1d 145 spin_lock(&mbuf_track_spin);
e9fa4b60 146 mbt->m = m;
5bd48c1d
MD
147 if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) {
148 spin_unlock(&mbuf_track_spin);
ed20d0e3 149 panic("mbuftrack: mbuf %p already being tracked", m);
5bd48c1d
MD
150 }
151 spin_unlock(&mbuf_track_spin);
e9fa4b60
MD
152}
153
154static void
155mbufuntrack(struct mbuf *m)
156{
157 struct mbtrack *mbt;
158
5bd48c1d 159 spin_lock(&mbuf_track_spin);
e9fa4b60
MD
160 mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
161 if (mbt == NULL) {
5bd48c1d 162 spin_unlock(&mbuf_track_spin);
ed20d0e3 163 panic("mbufuntrack: mbuf %p was not tracked", m);
e9fa4b60
MD
164 } else {
165 mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt);
6cef7136 166 spin_unlock(&mbuf_track_spin);
e9fa4b60
MD
167 kfree(mbt, M_MTRACK);
168 }
e9fa4b60
MD
169}
170
171void
172mbuftrackid(struct mbuf *m, int trackid)
173{
174 struct mbtrack *mbt;
175 struct mbuf *n;
176
5bd48c1d 177 spin_lock(&mbuf_track_spin);
e9fa4b60
MD
178 while (m) {
179 n = m->m_nextpkt;
180 while (m) {
181 mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m);
5bd48c1d
MD
182 if (mbt == NULL) {
183 spin_unlock(&mbuf_track_spin);
184 panic("mbuftrackid: mbuf %p not tracked", m);
185 }
186 mbt->trackid = trackid;
e9fa4b60
MD
187 m = m->m_next;
188 }
189 m = n;
190 }
5bd48c1d 191 spin_unlock(&mbuf_track_spin);
e9fa4b60
MD
192}
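/*
 * Illustrative usage sketch (not part of the original code): a subsystem
 * that wants its chains to stand out in the kern.ipc.showmbufs output
 * below could tag them after allocation.  The trackid value 42 is
 * purely hypothetical.
 *
 *	m = m_gethdr(M_WAITOK, MT_DATA);
 *	mbuftrackid(m, 42);
 *	...
 *	m_freem(m);	(mbufuntrack() runs from m_free())
 */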
193
194static int
195mbuftrack_callback(struct mbtrack *mbt, void *arg)
196{
197 struct sysctl_req *req = arg;
198 char buf[64];
199 int error;
200
201 ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid);
202
5bd48c1d 203 spin_unlock(&mbuf_track_spin);
e9fa4b60 204 error = SYSCTL_OUT(req, buf, strlen(buf));
5bd48c1d 205 spin_lock(&mbuf_track_spin);
e9fa4b60
MD
206 if (error)
207 return(-error);
208 return(0);
209}
210
211static int
212mbuftrack_show(SYSCTL_HANDLER_ARGS)
213{
214 int error;
215
5bd48c1d 216 spin_lock(&mbuf_track_spin);
e9fa4b60
MD
217 error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL,
218 mbuftrack_callback, req);
5bd48c1d 219 spin_unlock(&mbuf_track_spin);
e9fa4b60
MD
220 return (-error);
221}
222SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING,
223 0, 0, mbuftrack_show, "A", "Show all in-use mbufs");
224
225#else
226
227#define mbuftrack(m)
228#define mbufuntrack(m)
229
230#endif
231
7b6f875f 232static void mbinit(void *);
f3f3eadb 233SYSINIT(mbuf, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, mbinit, NULL);
984263bc 234
a8824a1d
SZ
235struct mbtypes_stat {
236 u_long stats[MT_NTYPES];
237} __cachealign;
238
239static struct mbtypes_stat mbtypes[SMP_MAXCPU];
90775e29 240
7a82f541 241static struct mbstat mbstat[SMP_MAXCPU] __cachealign;
984263bc
MD
242int max_linkhdr;
243int max_protohdr;
244int max_hdr;
245int max_datalen;
246int m_defragpackets;
247int m_defragbytes;
248int m_defraguseless;
249int m_defragfailure;
250#ifdef MBUF_STRESS_TEST
251int m_defragrandomfailures;
252#endif
253
7b6f875f 254struct objcache *mbuf_cache, *mbufphdr_cache;
94eaee9a 255struct objcache *mclmeta_cache, *mjclmeta_cache;
7b6f875f 256struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
94eaee9a 257struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache;
7b6f875f 258
b11c11e9
MD
259struct lock mbupdate_lk = LOCK_INITIALIZER("mbupdate", 0, 0);
260
8033b958
SZ
261int nmbclusters;
262static int nmbjclusters;
263int nmbufs;
984263bc 264
d5b73e64
MD
265static int mjclph_cachefrac;
266static int mjcl_cachefrac;
fa8f5efb
SZ
267static int mclph_cachefrac;
268static int mcl_cachefrac;
269
984263bc 270SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
093e85dc 271 &max_linkhdr, 0, "Max size of a link-level header");
984263bc 272SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
093e85dc
SG
273 &max_protohdr, 0, "Max size of a protocol header");
274SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0,
275 "Max size of link+protocol headers");
984263bc 276SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
093e85dc 277 &max_datalen, 0, "Max data payload size without headers");
984263bc 278SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
093e85dc 279 &mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations");
4c1e2509
JT
280static int do_mbstat(SYSCTL_HANDLER_ARGS);
281
282SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD,
093e85dc 283 0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics");
4c1e2509
JT
284
285static int do_mbtypes(SYSCTL_HANDLER_ARGS);
286
287SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtypes, CTLTYPE_ULONG|CTLFLAG_RD,
288 0, 0, do_mbtypes, "LU", "");
289
290static int
291do_mbstat(SYSCTL_HANDLER_ARGS)
292{
293 struct mbstat mbstat_total;
294 struct mbstat *mbstat_totalp;
295 int i;
296
297 bzero(&mbstat_total, sizeof(mbstat_total));
298 mbstat_totalp = &mbstat_total;
299
300 for (i = 0; i < ncpus; i++)
301 {
302 mbstat_total.m_mbufs += mbstat[i].m_mbufs;
303 mbstat_total.m_clusters += mbstat[i].m_clusters;
c83e573d 304 mbstat_total.m_jclusters += mbstat[i].m_jclusters;
4c1e2509
JT
305 mbstat_total.m_clfree += mbstat[i].m_clfree;
306 mbstat_total.m_drops += mbstat[i].m_drops;
307 mbstat_total.m_wait += mbstat[i].m_wait;
308 mbstat_total.m_drain += mbstat[i].m_drain;
309 mbstat_total.m_mcfail += mbstat[i].m_mcfail;
310 mbstat_total.m_mpfail += mbstat[i].m_mpfail;
311
312 }
313 /*
 314 * The following fields are not cumulative, so just
 315 * get their values once.
316 */
317 mbstat_total.m_msize = mbstat[0].m_msize;
318 mbstat_total.m_mclbytes = mbstat[0].m_mclbytes;
319 mbstat_total.m_minclsize = mbstat[0].m_minclsize;
320 mbstat_total.m_mlen = mbstat[0].m_mlen;
321 mbstat_total.m_mhlen = mbstat[0].m_mhlen;
322
323 return(sysctl_handle_opaque(oidp, mbstat_totalp, sizeof(mbstat_total), req));
324}
325
326static int
327do_mbtypes(SYSCTL_HANDLER_ARGS)
328{
329 u_long totals[MT_NTYPES];
330 int i, j;
331
332 for (i = 0; i < MT_NTYPES; i++)
333 totals[i] = 0;
334
335 for (i = 0; i < ncpus; i++)
336 {
337 for (j = 0; j < MT_NTYPES; j++)
a8824a1d 338 totals[j] += mbtypes[i].stats[j];
4c1e2509
JT
339 }
340
341 return(sysctl_handle_opaque(oidp, totals, sizeof(totals), req));
342}
18c48b9c
MD
343
344/*
b11c11e9
MD
 345 * These variables may be set as boot-time tunables or changed live via
 346 * sysctl.  Setting them too low can deadlock your network.  Network
 347 * interfaces may also adjust nmbclusters and/or nmbjclusters to account
 348 * for preloading the hardware rings.
18c48b9c 349 */
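/*
 * A hypothetical sketch of such a live adjustment, e.g. from a driver
 * that preloads large receive rings (my_ring_size is made up; this is
 * not taken from any existing driver).  mbupdatelimits() requires
 * mbupdate_lk to be held, see below.
 *
 *	lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
 *	if (nmbclusters < my_ring_size * 4) {
 *		nmbclusters = my_ring_size * 4;
 *		mbupdatelimits();
 *	}
 *	lockmgr(&mbupdate_lk, LK_RELEASE);
 */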
b11c11e9
MD
350static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS);
351static int sysctl_nmbjclusters(SYSCTL_HANDLER_ARGS);
352static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS);
353SYSCTL_PROC(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLTYPE_INT | CTLFLAG_RW,
354 0, 0, sysctl_nmbclusters, "I",
355 "Maximum number of mbuf clusters available");
356SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjclusters, CTLTYPE_INT | CTLFLAG_RW,
357 0, 0, sysctl_nmbjclusters, "I",
fa8f5efb 358 "Maximum number of mbuf jclusters available");
b11c11e9
MD
359SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT | CTLFLAG_RW,
360 0, 0, sysctl_nmbufs, "I",
361 "Maximum number of mbufs available");
362
d5b73e64
MD
363SYSCTL_INT(_kern_ipc, OID_AUTO, mjclph_cachefrac, CTLFLAG_RD,
364 &mjclph_cachefrac, 0,
365 "Fraction of cacheable mbuf jclusters w/ pkthdr");
366SYSCTL_INT(_kern_ipc, OID_AUTO, mjcl_cachefrac, CTLFLAG_RD,
367 &mjcl_cachefrac, 0,
368 "Fraction of cacheable mbuf jclusters");
fa8f5efb
SZ
369SYSCTL_INT(_kern_ipc, OID_AUTO, mclph_cachefrac, CTLFLAG_RD,
370 &mclph_cachefrac, 0,
371 "Fraction of cacheable mbuf clusters w/ pkthdr");
372SYSCTL_INT(_kern_ipc, OID_AUTO, mcl_cachefrac, CTLFLAG_RD,
373 &mcl_cachefrac, 0, "Fraction of cacheable mbuf clusters");
7b6f875f 374
984263bc 375SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
093e85dc 376 &m_defragpackets, 0, "Number of defragment packets");
984263bc 377SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
093e85dc 378 &m_defragbytes, 0, "Number of defragment bytes");
984263bc 379SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
093e85dc 380 &m_defraguseless, 0, "Number of useless defragment mbuf chain operations");
984263bc 381SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
093e85dc 382 &m_defragfailure, 0, "Number of failed defragment mbuf chain operations");
984263bc
MD
383#ifdef MBUF_STRESS_TEST
384SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
385 &m_defragrandomfailures, 0, "");
386#endif
387
90775e29
MD
388static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
389static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
7b6f875f 390static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
90775e29
MD
391
392static void m_reclaim (void);
90775e29
MD
393static void m_mclref(void *arg);
394static void m_mclfree(void *arg);
c83e573d 395static void m_mjclfree(void *arg);
984263bc 396
4e23f366
MD
397/*
398 * NOTE: Default NMBUFS must take into account a possible DOS attack
399 * using fd passing on unix domain sockets.
400 */
984263bc
MD
401#ifndef NMBCLUSTERS
402#define NMBCLUSTERS (512 + maxusers * 16)
403#endif
d5b73e64
MD
404#ifndef MJCLPH_CACHEFRAC
405#define MJCLPH_CACHEFRAC 16
406#endif
407#ifndef MJCL_CACHEFRAC
408#define MJCL_CACHEFRAC 4
409#endif
fa8f5efb
SZ
410#ifndef MCLPH_CACHEFRAC
411#define MCLPH_CACHEFRAC 16
412#endif
413#ifndef MCL_CACHEFRAC
414#define MCL_CACHEFRAC 4
415#endif
8033b958 416#ifndef NMBJCLUSTERS
c83e573d 417#define NMBJCLUSTERS (NMBCLUSTERS / 2)
8033b958 418#endif
984263bc 419#ifndef NMBUFS
4e23f366 420#define NMBUFS (nmbclusters * 2 + maxfiles)
984263bc
MD
421#endif
422
e9c1cc61
SZ
423#define NMBCLUSTERS_MIN (NMBCLUSTERS / 2)
424#define NMBJCLUSTERS_MIN (NMBJCLUSTERS / 2)
425#define NMBUFS_MIN ((NMBCLUSTERS * 2 + maxfiles) / 2)
426
984263bc
MD
427/*
428 * Perform sanity checks of tunables declared above.
429 */
430static void
431tunable_mbinit(void *dummy)
432{
984263bc
MD
433 /*
434 * This has to be done before VM init.
435 */
436 nmbclusters = NMBCLUSTERS;
437 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
d5b73e64
MD
438 mjclph_cachefrac = MJCLPH_CACHEFRAC;
439 TUNABLE_INT_FETCH("kern.ipc.mjclph_cachefrac", &mjclph_cachefrac);
440 mjcl_cachefrac = MJCL_CACHEFRAC;
441 TUNABLE_INT_FETCH("kern.ipc.mjcl_cachefrac", &mjcl_cachefrac);
fa8f5efb
SZ
442 mclph_cachefrac = MCLPH_CACHEFRAC;
443 TUNABLE_INT_FETCH("kern.ipc.mclph_cachefrac", &mclph_cachefrac);
444 mcl_cachefrac = MCL_CACHEFRAC;
445 TUNABLE_INT_FETCH("kern.ipc.mcl_cachefrac", &mcl_cachefrac);
446
d5b73e64
MD
447 /*
 448 * WARNING! Each mcl cache feeds two mbuf caches, so the minimum
449 * cachefrac is 2. For safety, use 3.
450 */
451 if (mjclph_cachefrac < 3)
452 mjclph_cachefrac = 3;
453 if (mjcl_cachefrac < 3)
454 mjcl_cachefrac = 3;
455 if (mclph_cachefrac < 3)
456 mclph_cachefrac = 3;
457 if (mcl_cachefrac < 3)
458 mcl_cachefrac = 3;
459
8033b958
SZ
460 nmbjclusters = NMBJCLUSTERS;
461 TUNABLE_INT_FETCH("kern.ipc.nmbjclusters", &nmbjclusters);
fa8f5efb 462
984263bc
MD
463 nmbufs = NMBUFS;
464 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
fa8f5efb 465
984263bc
MD
466 /* Sanity checks */
467 if (nmbufs < nmbclusters * 2)
468 nmbufs = nmbclusters * 2;
984263bc 469}
ba39e2e0
MD
470SYSINIT(tunable_mbinit, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
471 tunable_mbinit, NULL);
984263bc 472
b11c11e9
MD
473/*
474 * Sysctl support to update nmbclusters, nmbjclusters, and nmbufs.
475 */
476static int
477sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
478{
479 int error;
480 int value;
481
b11c11e9
MD
482 value = nmbclusters;
483 error = sysctl_handle_int(oidp, &value, 0, req);
8179ec90 484 if (error || req->newptr == NULL)
b11c11e9 485 return error;
8179ec90 486
e9c1cc61
SZ
487 if (value < NMBCLUSTERS_MIN)
488 return EINVAL;
489
8179ec90 490 lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
b11c11e9
MD
491 if (nmbclusters != value) {
492 nmbclusters = value;
493 mbupdatelimits();
494 }
495 lockmgr(&mbupdate_lk, LK_RELEASE);
496 return 0;
497}
498
499static int
500sysctl_nmbjclusters(SYSCTL_HANDLER_ARGS)
501{
502 int error;
503 int value;
504
b11c11e9
MD
505 value = nmbjclusters;
506 error = sysctl_handle_int(oidp, &value, 0, req);
8179ec90 507 if (error || req->newptr == NULL)
b11c11e9 508 return error;
8179ec90 509
e9c1cc61
SZ
510 if (value < NMBJCLUSTERS_MIN)
511 return EINVAL;
512
8179ec90 513 lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
b11c11e9
MD
514 if (nmbjclusters != value) {
515 nmbjclusters = value;
516 mbupdatelimits();
517 }
518 lockmgr(&mbupdate_lk, LK_RELEASE);
519 return 0;
520}
521
522static int
523sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
524{
525 int error;
526 int value;
527
b11c11e9
MD
528 value = nmbufs;
529 error = sysctl_handle_int(oidp, &value, 0, req);
8179ec90 530 if (error || req->newptr == NULL)
b11c11e9 531 return error;
8179ec90 532
e9c1cc61
SZ
533 if (value < NMBUFS_MIN)
534 return EINVAL;
535
8179ec90 536 lockmgr(&mbupdate_lk, LK_EXCLUSIVE);
b11c11e9
MD
537 if (nmbufs != value) {
538 nmbufs = value;
539 mbupdatelimits();
540 }
541 lockmgr(&mbupdate_lk, LK_RELEASE);
542 return 0;
543}
544
984263bc
MD
545/* "number of clusters of pages" */
546#define NCL_INIT 1
547
548#define NMB_INIT 16
549
7b6f875f
JH
550/*
551 * The mbuf object cache only guarantees that m_next and m_nextpkt are
552 * NULL and that m_data points to the beginning of the data area. In
553 * particular, m_len and m_pkthdr.len are uninitialized. It is the
554 * responsibility of the caller to initialize those fields before use.
555 */
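/*
 * A minimal caller-side sketch of that contract (illustrative only):
 *
 *	m = objcache_get(mbuf_cache, M_WAITOK);
 *	if (m != NULL)
 *		m->m_len = 0;	(m_len is not set by the constructor)
 *
 * m_get() and m_gethdr() below perform this initialization on behalf
 * of their callers.
 */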
db11cb20 556static __inline boolean_t
7b6f875f 557mbuf_ctor(void *obj, void *private, int ocflags)
984263bc 558{
7b6f875f 559 struct mbuf *m = obj;
984263bc 560
7b6f875f
JH
561 m->m_next = NULL;
562 m->m_nextpkt = NULL;
563 m->m_data = m->m_dat;
564 m->m_flags = 0;
565
566 return (TRUE);
984263bc
MD
567}
568
569/*
7b6f875f 570 * Initialize the mbuf and the packet header fields.
984263bc 571 */
7b6f875f
JH
572static boolean_t
573mbufphdr_ctor(void *obj, void *private, int ocflags)
984263bc 574{
7b6f875f 575 struct mbuf *m = obj;
984263bc 576
7b6f875f
JH
577 m->m_next = NULL;
578 m->m_nextpkt = NULL;
579 m->m_data = m->m_pktdat;
77e294a1 580 m->m_flags = M_PKTHDR | M_PHCACHE;
984263bc 581
7b6f875f
JH
582 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */
583 SLIST_INIT(&m->m_pkthdr.tags);
584 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */
585 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */
586
587 return (TRUE);
984263bc
MD
588}
589
590/*
7b6f875f 591 * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
984263bc 592 */
7b6f875f
JH
593static boolean_t
594mclmeta_ctor(void *obj, void *private, int ocflags)
984263bc 595{
7b6f875f
JH
596 struct mbcluster *cl = obj;
597 void *buf;
598
599 if (ocflags & M_NOWAIT)
efda3bd0 600 buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
7b6f875f 601 else
efda3bd0 602 buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
7b6f875f
JH
603 if (buf == NULL)
604 return (FALSE);
77e294a1 605 cl->mcl_refs = 0;
7b6f875f
JH
606 cl->mcl_data = buf;
607 return (TRUE);
608}
984263bc 609
94eaee9a
JT
610static boolean_t
611mjclmeta_ctor(void *obj, void *private, int ocflags)
612{
613 struct mbcluster *cl = obj;
614 void *buf;
615
616 if (ocflags & M_NOWAIT)
617 buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO);
618 else
619 buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO);
620 if (buf == NULL)
621 return (FALSE);
622 cl->mcl_refs = 0;
623 cl->mcl_data = buf;
624 return (TRUE);
625}
626
c3ef87ca
MD
627static void
628mclmeta_dtor(void *obj, void *private)
629{
630 struct mbcluster *mcl = obj;
631
632 KKASSERT(mcl->mcl_refs == 0);
efda3bd0 633 kfree(mcl->mcl_data, M_MBUFCL);
c3ef87ca
MD
634}
635
7b6f875f 636static void
94eaee9a 637linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size)
7b6f875f 638{
984263bc 639 /*
7b6f875f
JH
640 * Add the cluster to the mbuf. The caller will detect that the
641 * mbuf now has an attached cluster.
984263bc 642 */
7b6f875f
JH
643 m->m_ext.ext_arg = cl;
644 m->m_ext.ext_buf = cl->mcl_data;
645 m->m_ext.ext_ref = m_mclref;
c83e573d
SZ
646 if (size != MCLBYTES)
647 m->m_ext.ext_free = m_mjclfree;
648 else
649 m->m_ext.ext_free = m_mclfree;
94eaee9a 650 m->m_ext.ext_size = size;
df8d1020 651 atomic_add_int(&cl->mcl_refs, 1);
984263bc 652
7b6f875f
JH
653 m->m_data = m->m_ext.ext_buf;
654 m->m_flags |= M_EXT | M_EXT_CLUSTER;
984263bc
MD
655}
656
94eaee9a
JT
657static void
658linkcluster(struct mbuf *m, struct mbcluster *cl)
659{
660 linkjcluster(m, cl, MCLBYTES);
661}
662
7b6f875f
JH
663static boolean_t
664mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
665{
666 struct mbuf *m = obj;
667 struct mbcluster *cl;
668
669 mbufphdr_ctor(obj, private, ocflags);
670 cl = objcache_get(mclmeta_cache, ocflags);
a5955b15
MD
671 if (cl == NULL) {
672 ++mbstat[mycpu->gd_cpuid].m_drops;
7b6f875f 673 return (FALSE);
a5955b15 674 }
77e294a1 675 m->m_flags |= M_CLCACHE;
7b6f875f
JH
676 linkcluster(m, cl);
677 return (TRUE);
678}
984263bc 679
94eaee9a
JT
680static boolean_t
681mbufphdrjcluster_ctor(void *obj, void *private, int ocflags)
682{
683 struct mbuf *m = obj;
684 struct mbcluster *cl;
685
686 mbufphdr_ctor(obj, private, ocflags);
687 cl = objcache_get(mjclmeta_cache, ocflags);
688 if (cl == NULL) {
689 ++mbstat[mycpu->gd_cpuid].m_drops;
690 return (FALSE);
691 }
692 m->m_flags |= M_CLCACHE;
693 linkjcluster(m, cl, MJUMPAGESIZE);
694 return (TRUE);
695}
696
7b6f875f
JH
697static boolean_t
698mbufcluster_ctor(void *obj, void *private, int ocflags)
984263bc 699{
7b6f875f
JH
700 struct mbuf *m = obj;
701 struct mbcluster *cl;
702
703 mbuf_ctor(obj, private, ocflags);
704 cl = objcache_get(mclmeta_cache, ocflags);
a5955b15
MD
705 if (cl == NULL) {
706 ++mbstat[mycpu->gd_cpuid].m_drops;
7b6f875f 707 return (FALSE);
a5955b15 708 }
77e294a1 709 m->m_flags |= M_CLCACHE;
7b6f875f
JH
710 linkcluster(m, cl);
711 return (TRUE);
712}
984263bc 713
94eaee9a
JT
714static boolean_t
715mbufjcluster_ctor(void *obj, void *private, int ocflags)
716{
717 struct mbuf *m = obj;
718 struct mbcluster *cl;
719
720 mbuf_ctor(obj, private, ocflags);
721 cl = objcache_get(mjclmeta_cache, ocflags);
722 if (cl == NULL) {
723 ++mbstat[mycpu->gd_cpuid].m_drops;
724 return (FALSE);
725 }
726 m->m_flags |= M_CLCACHE;
727 linkjcluster(m, cl, MJUMPAGESIZE);
728 return (TRUE);
729}
730
77e294a1
MD
731/*
732 * Used for both the cluster and cluster PHDR caches.
733 *
734 * The mbuf may have lost its cluster due to sharing, deal
735 * with the situation by checking M_EXT.
736 */
7b6f875f
JH
737static void
738mbufcluster_dtor(void *obj, void *private)
984263bc 739{
7b6f875f 740 struct mbuf *m = obj;
77e294a1 741 struct mbcluster *mcl;
984263bc 742
77e294a1
MD
743 if (m->m_flags & M_EXT) {
744 KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
745 mcl = m->m_ext.ext_arg;
746 KKASSERT(mcl->mcl_refs == 1);
747 mcl->mcl_refs = 0;
94eaee9a
JT
748 if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES)
749 objcache_put(mjclmeta_cache, mcl);
750 else
751 objcache_put(mclmeta_cache, mcl);
77e294a1 752 }
984263bc
MD
753}
754
7b6f875f
JH
755struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
756struct objcache_malloc_args mclmeta_malloc_args =
757 { sizeof(struct mbcluster), M_MCLMETA };
758
759/* ARGSUSED*/
90775e29 760static void
7b6f875f 761mbinit(void *dummy)
984263bc 762{
8033b958 763 int mb_limit, cl_limit, ncl_limit, jcl_limit;
0aa16b5d 764 int limit;
4c1e2509
JT
765 int i;
766
0aa16b5d
SZ
767 /*
768 * Initialize statistics
769 */
770 for (i = 0; i < ncpus; i++) {
461213b7
MD
771 mbstat[i].m_msize = MSIZE;
772 mbstat[i].m_mclbytes = MCLBYTES;
773 mbstat[i].m_mjumpagesize = MJUMPAGESIZE;
774 mbstat[i].m_minclsize = MINCLSIZE;
775 mbstat[i].m_mlen = MLEN;
776 mbstat[i].m_mhlen = MHLEN;
4c1e2509 777 }
984263bc 778
0aa16b5d 779 /*
b11c11e9 780 * Create object caches and save cluster limits, which will
0aa16b5d
SZ
781 * be used to adjust backing kmalloc pools' limit later.
782 */
783
6f21e2f4 784 mb_limit = cl_limit = 0;
0aa16b5d
SZ
785
786 limit = nmbufs;
3508d9a1 787 mbuf_cache = objcache_create("mbuf",
d5b73e64 788 limit, nmbufs / 4,
5b7da64a 789 mbuf_ctor, NULL, NULL,
7b6f875f 790 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
6f21e2f4 791 mb_limit += limit;
0aa16b5d
SZ
792
793 limit = nmbufs;
3508d9a1 794 mbufphdr_cache = objcache_create("mbuf pkt hdr",
2fce2579 795 limit, nmbufs / 4,
5b7da64a 796 mbufphdr_ctor, NULL, NULL,
7b6f875f 797 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
6f21e2f4 798 mb_limit += limit;
0aa16b5d 799
8033b958 800 ncl_limit = nmbclusters;
3508d9a1 801 mclmeta_cache = objcache_create("cluster mbuf",
d5b73e64 802 ncl_limit, nmbclusters / 4,
7b6f875f
JH
803 mclmeta_ctor, mclmeta_dtor, NULL,
804 objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
8033b958 805 cl_limit += ncl_limit;
0aa16b5d 806
8033b958 807 jcl_limit = nmbjclusters;
3508d9a1 808 mjclmeta_cache = objcache_create("jcluster mbuf",
d5b73e64 809 jcl_limit, nmbjclusters / 4,
94eaee9a
JT
810 mjclmeta_ctor, mclmeta_dtor, NULL,
811 objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
8033b958 812 cl_limit += jcl_limit;
94eaee9a 813
0aa16b5d 814 limit = nmbclusters;
3508d9a1 815 mbufcluster_cache = objcache_create("mbuf + cluster",
2fce2579 816 limit, nmbclusters / mcl_cachefrac,
7b6f875f
JH
817 mbufcluster_ctor, mbufcluster_dtor, NULL,
818 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
6f21e2f4 819 mb_limit += limit;
0aa16b5d
SZ
820
821 limit = nmbclusters;
7b6f875f 822 mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
2fce2579 823 limit, nmbclusters / mclph_cachefrac,
3508d9a1 824 mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
7b6f875f 825 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
6f21e2f4 826 mb_limit += limit;
0aa16b5d 827
c83e573d 828 limit = nmbjclusters;
3508d9a1 829 mbufjcluster_cache = objcache_create("mbuf + jcluster",
d5b73e64 830 limit, nmbjclusters / mjcl_cachefrac,
94eaee9a
JT
831 mbufjcluster_ctor, mbufcluster_dtor, NULL,
832 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
8033b958 833 mb_limit += limit;
94eaee9a 834
8033b958 835 limit = nmbjclusters;
94eaee9a 836 mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster",
d5b73e64 837 limit, nmbjclusters / mjclph_cachefrac,
3508d9a1 838 mbufphdrjcluster_ctor, mbufcluster_dtor, NULL,
94eaee9a 839 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
8033b958 840 mb_limit += limit;
94eaee9a 841
0aa16b5d
SZ
842 /*
843 * Adjust backing kmalloc pools' limit
3f98f485
SZ
844 *
845 * NOTE: We raise the limit by another 1/8 to take the effect
846 * of loosememuse into account.
0aa16b5d 847 */
3f98f485 848 cl_limit += cl_limit / 8;
0aa16b5d 849 kmalloc_raise_limit(mclmeta_malloc_args.mtype,
d5b73e64 850 mclmeta_malloc_args.objsize * (size_t)cl_limit);
430919cb 851 kmalloc_raise_limit(M_MBUFCL,
d5b73e64
MD
852 (MCLBYTES * (size_t)ncl_limit) +
853 (MJUMPAGESIZE * (size_t)jcl_limit));
0aa16b5d 854
3f98f485 855 mb_limit += mb_limit / 8;
0aa16b5d 856 kmalloc_raise_limit(mbuf_malloc_args.mtype,
d5b73e64 857 mbuf_malloc_args.objsize * (size_t)mb_limit);
90775e29 858}
984263bc 859
b11c11e9
MD
860/*
861 * Adjust mbuf limits after changes have been made
862 *
863 * Caller must hold mbupdate_lk
864 */
865void
866mbupdatelimits(void)
867{
868 int mb_limit, cl_limit, ncl_limit, jcl_limit;
869 int limit;
870
8179ec90
SZ
871 KASSERT(lockstatus(&mbupdate_lk, curthread) != 0,
872 ("mbupdate_lk is not held"));
873
b11c11e9
MD
874 /*
875 * Figure out adjustments to object caches after nmbufs, nmbclusters,
876 * or nmbjclusters has been modified.
877 */
878 mb_limit = cl_limit = 0;
879
880 limit = nmbufs;
881 objcache_set_cluster_limit(mbuf_cache, limit);
882 mb_limit += limit;
883
884 limit = nmbufs;
885 objcache_set_cluster_limit(mbufphdr_cache, limit);
886 mb_limit += limit;
887
888 ncl_limit = nmbclusters;
889 objcache_set_cluster_limit(mclmeta_cache, ncl_limit);
890 cl_limit += ncl_limit;
891
892 jcl_limit = nmbjclusters;
893 objcache_set_cluster_limit(mjclmeta_cache, jcl_limit);
894 cl_limit += jcl_limit;
895
896 limit = nmbclusters;
897 objcache_set_cluster_limit(mbufcluster_cache, limit);
898 mb_limit += limit;
899
900 limit = nmbclusters;
901 objcache_set_cluster_limit(mbufphdrcluster_cache, limit);
902 mb_limit += limit;
903
904 limit = nmbjclusters;
905 objcache_set_cluster_limit(mbufjcluster_cache, limit);
906 mb_limit += limit;
907
908 limit = nmbjclusters;
909 objcache_set_cluster_limit(mbufphdrjcluster_cache, limit);
910 mb_limit += limit;
911
912 /*
913 * Adjust backing kmalloc pools' limit
914 *
915 * NOTE: We raise the limit by another 1/8 to take the effect
916 * of loosememuse into account.
917 */
918 cl_limit += cl_limit / 8;
919 kmalloc_raise_limit(mclmeta_malloc_args.mtype,
920 mclmeta_malloc_args.objsize * (size_t)cl_limit);
921 kmalloc_raise_limit(M_MBUFCL,
922 (MCLBYTES * (size_t)ncl_limit) +
923 (MJUMPAGESIZE * (size_t)jcl_limit));
924 mb_limit += mb_limit / 8;
925 kmalloc_raise_limit(mbuf_malloc_args.mtype,
926 mbuf_malloc_args.objsize * (size_t)mb_limit);
927}
928
90775e29
MD
929/*
930 * Return the number of references to this mbuf's data. 0 is returned
931 * if the mbuf is not M_EXT, a reference count is returned if it is
7b6f875f 932 * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
90775e29
MD
933 */
934int
935m_sharecount(struct mbuf *m)
936{
7b6f875f
JH
937 switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
938 case 0:
939 return (0);
940 case M_EXT:
941 return (99);
942 case M_EXT | M_EXT_CLUSTER:
943 return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
944 }
945 /* NOTREACHED */
946 return (0); /* to shut up compiler */
90775e29
MD
947}
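/*
 * Illustrative sketch of the intended use (assumes a packet-header
 * chain; not from the original sources): code that wants to modify
 * cluster data in place must hold the only reference, e.g.
 *
 *	if ((m->m_flags & M_EXT) && m_sharecount(m) != 1)
 *		m = m_dup(m, M_NOWAIT);	(take a writable copy instead)
 *
 * m_free() below performs the same test when deciding whether an
 * mbuf+cluster pair may be returned to its combined objcache.
 */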
948
949/*
950 * change mbuf to new type
951 */
952void
953m_chtype(struct mbuf *m, int type)
954{
4c1e2509
JT
955 struct globaldata *gd = mycpu;
956
a8824a1d
SZ
957 ++mbtypes[gd->gd_cpuid].stats[type];
958 --mbtypes[gd->gd_cpuid].stats[m->m_type];
461213b7 959 m->m_type = type;
984263bc
MD
960}
961
984263bc 962static void
8a3125c6 963m_reclaim(void)
984263bc 964{
1fd87d54
RG
965 struct domain *dp;
966 struct protosw *pr;
984263bc 967
5bd48c1d
MD
968 kprintf("Debug: m_reclaim() called\n");
969
9c70fe43 970 SLIST_FOREACH(dp, &domains, dom_next) {
8a3125c6 971 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
984263bc
MD
972 if (pr->pr_drain)
973 (*pr->pr_drain)();
8a3125c6
MD
974 }
975 }
461213b7 976 ++mbstat[mycpu->gd_cpuid].m_drain;
984263bc
MD
977}
978
db11cb20 979static __inline void
7b6f875f
JH
980updatestats(struct mbuf *m, int type)
981{
4c1e2509 982 struct globaldata *gd = mycpu;
7b6f875f 983
fcd1202a 984 m->m_type = type;
e9fa4b60 985 mbuftrack(m);
982f999d
MD
986#ifdef MBUF_DEBUG
987 KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m));
988 KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m));
989#endif
4c1e2509 990
a8824a1d 991 ++mbtypes[gd->gd_cpuid].stats[type];
461213b7 992 ++mbstat[gd->gd_cpuid].m_mbufs;
4c1e2509 993
7b6f875f
JH
994}
995
984263bc 996/*
7b6f875f 997 * Allocate an mbuf.
984263bc
MD
998 */
999struct mbuf *
8a3125c6 1000m_get(int how, int type)
984263bc 1001{
12496bdf 1002 struct mbuf *m;
7b6f875f 1003 int ntries = 0;
b5523eac 1004 int ocf = MB_OCFLAG(how);
12496bdf 1005
7b6f875f
JH
1006retryonce:
1007
1008 m = objcache_get(mbuf_cache, ocf);
1009
1010 if (m == NULL) {
b5523eac 1011 if ((ocf & M_WAITOK) && ntries++ == 0) {
7b6f875f
JH
1012 struct objcache *reclaimlist[] = {
1013 mbufphdr_cache,
5bd48c1d 1014 mbufcluster_cache,
94eaee9a
JT
1015 mbufphdrcluster_cache,
1016 mbufjcluster_cache,
1017 mbufphdrjcluster_cache
7b6f875f 1018 };
a3034532 1019 const int nreclaims = NELEM(reclaimlist);
7b6f875f
JH
1020
1021 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
1022 m_reclaim();
1023 goto retryonce;
c6339e39 1024 }
a5955b15 1025 ++mbstat[mycpu->gd_cpuid].m_drops;
7b6f875f 1026 return (NULL);
12496bdf 1027 }
982f999d
MD
1028#ifdef MBUF_DEBUG
1029 KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m));
1030#endif
5bd08532 1031 m->m_len = 0;
c6339e39 1032
7b6f875f 1033 updatestats(m, type);
984263bc
MD
1034 return (m);
1035}
1036
1037struct mbuf *
8a3125c6 1038m_gethdr(int how, int type)
984263bc 1039{
12496bdf 1040 struct mbuf *m;
b5523eac 1041 int ocf = MB_OCFLAG(how);
7b6f875f 1042 int ntries = 0;
12496bdf 1043
7b6f875f
JH
1044retryonce:
1045
1046 m = objcache_get(mbufphdr_cache, ocf);
1047
1048 if (m == NULL) {
b5523eac 1049 if ((ocf & M_WAITOK) && ntries++ == 0) {
7b6f875f
JH
1050 struct objcache *reclaimlist[] = {
1051 mbuf_cache,
94eaee9a
JT
1052 mbufcluster_cache, mbufphdrcluster_cache,
1053 mbufjcluster_cache, mbufphdrjcluster_cache
7b6f875f 1054 };
a3034532 1055 const int nreclaims = NELEM(reclaimlist);
7b6f875f
JH
1056
1057 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
1058 m_reclaim();
1059 goto retryonce;
c6339e39 1060 }
a5955b15 1061 ++mbstat[mycpu->gd_cpuid].m_drops;
7b6f875f 1062 return (NULL);
12496bdf 1063 }
982f999d
MD
1064#ifdef MBUF_DEBUG
1065 KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m));
1066#endif
5bd08532
MD
1067 m->m_len = 0;
1068 m->m_pkthdr.len = 0;
c6339e39 1069
7b6f875f 1070 updatestats(m, type);
984263bc
MD
1071 return (m);
1072}
1073
7b6f875f
JH
1074/*
1075 * Get a mbuf (not a mbuf cluster!) and zero it.
1076 * Deprecated.
1077 */
984263bc 1078struct mbuf *
8a3125c6 1079m_getclr(int how, int type)
984263bc 1080{
1fd87d54 1081 struct mbuf *m;
984263bc 1082
7b6f875f
JH
1083 m = m_get(how, type);
1084 if (m != NULL)
1085 bzero(m->m_data, MLEN);
984263bc
MD
1086 return (m);
1087}
1088
9c24e04a
SZ
1089static struct mbuf *
1090m_getcl_cache(int how, short type, int flags, struct objcache *mbclc,
c83e573d 1091 struct objcache *mbphclc, u_long *cl_stats)
94eaee9a
JT
1092{
1093 struct mbuf *m = NULL;
b5523eac 1094 int ocflags = MB_OCFLAG(how);
94eaee9a
JT
1095 int ntries = 0;
1096
1097retryonce:
1098
1099 if (flags & M_PKTHDR)
2e7afdb4 1100 m = objcache_get(mbphclc, ocflags);
94eaee9a 1101 else
2e7afdb4 1102 m = objcache_get(mbclc, ocflags);
94eaee9a
JT
1103
1104 if (m == NULL) {
b5523eac 1105 if ((ocflags & M_WAITOK) && ntries++ == 0) {
94eaee9a
JT
1106 struct objcache *reclaimlist[1];
1107
1108 if (flags & M_PKTHDR)
2e7afdb4 1109 reclaimlist[0] = mbclc;
94eaee9a 1110 else
2e7afdb4 1111 reclaimlist[0] = mbphclc;
94eaee9a
JT
1112 if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
1113 m_reclaim();
1114 goto retryonce;
1115 }
1116 ++mbstat[mycpu->gd_cpuid].m_drops;
1117 return (NULL);
1118 }
1119
1120#ifdef MBUF_DEBUG
1121 KASSERT(m->m_data == m->m_ext.ext_buf,
1122 ("mbuf %p: bad m_data in get", m));
1123#endif
1124 m->m_type = type;
1125 m->m_len = 0;
 1126 m->m_pkthdr.len = 0; /* just do it unconditionally */
1127
1128 mbuftrack(m);
1129
a8824a1d 1130 ++mbtypes[mycpu->gd_cpuid].stats[type];
c83e573d 1131 ++(*cl_stats);
94eaee9a
JT
1132 return (m);
1133}
1134
9c24e04a
SZ
1135struct mbuf *
1136m_getjcl(int how, short type, int flags, size_t size)
1137{
1138 struct objcache *mbclc, *mbphclc;
c83e573d 1139 u_long *cl_stats;
9c24e04a
SZ
1140
1141 switch (size) {
1142 case MCLBYTES:
1143 mbclc = mbufcluster_cache;
1144 mbphclc = mbufphdrcluster_cache;
c83e573d 1145 cl_stats = &mbstat[mycpu->gd_cpuid].m_clusters;
9c24e04a
SZ
1146 break;
1147
1148 default:
1149 mbclc = mbufjcluster_cache;
1150 mbphclc = mbufphdrjcluster_cache;
c83e573d 1151 cl_stats = &mbstat[mycpu->gd_cpuid].m_jclusters;
9c24e04a
SZ
1152 break;
1153 }
c83e573d 1154 return m_getcl_cache(how, type, flags, mbclc, mbphclc, cl_stats);
9c24e04a
SZ
1155}
1156
984263bc 1157/*
7b6f875f 1158 * Returns an mbuf with an attached cluster.
984263bc
MD
 1159 * Because many network drivers use this kind of buffer a lot, it is
1160 * convenient to keep a small pool of free buffers of this kind.
1161 * Even a small size such as 10 gives about 10% improvement in the
1162 * forwarding rate in a bridge or router.
984263bc 1163 */
984263bc
MD
1164struct mbuf *
1165m_getcl(int how, short type, int flags)
1166{
9c24e04a 1167 return m_getcl_cache(how, type, flags,
c83e573d
SZ
1168 mbufcluster_cache, mbufphdrcluster_cache,
1169 &mbstat[mycpu->gd_cpuid].m_clusters);
984263bc
MD
1170}
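/*
 * Typical driver-style usage of m_getcl() (hedged sketch, not from any
 * particular driver): allocate a packet header mbuf with a 2K cluster
 * for a receive ring and size it to the full cluster.
 *
 *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = MCLBYTES;
 */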
1171
1172/*
50503f0f
JH
1173 * Allocate chain of requested length.
1174 */
1175struct mbuf *
1176m_getc(int len, int how, int type)
1177{
1178 struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
1179 int nsize;
1180
1181 while (len > 0) {
1182 n = m_getl(len, how, type, 0, &nsize);
1183 if (n == NULL)
1184 goto failed;
1185 n->m_len = 0;
1186 *ntail = n;
1187 ntail = &n->m_next;
1188 len -= nsize;
1189 }
1190 return (nfirst);
1191
1192failed:
1193 m_freem(nfirst);
1194 return (NULL);
1195}
1196
1197/*
1198 * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
1199 * and return a pointer to the head of the allocated chain. If m0 is
984263bc
MD
1200 * non-null, then we assume that it is a single mbuf or an mbuf chain to
1201 * which we want len bytes worth of mbufs and/or clusters attached, and so
50503f0f 1202 * if we succeed in allocating it, we will just return a pointer to m0.
984263bc
MD
1203 *
1204 * If we happen to fail at any point during the allocation, we will free
1205 * up everything we have already allocated and return NULL.
1206 *
50503f0f 1207 * Deprecated. Use m_getc() and m_cat() instead.
984263bc
MD
1208 */
1209struct mbuf *
dc14b0a9 1210m_getm(struct mbuf *m0, int len, int type, int how)
984263bc 1211{
50503f0f 1212 struct mbuf *nfirst;
984263bc 1213
50503f0f 1214 nfirst = m_getc(len, how, type);
984263bc 1215
50503f0f
JH
1216 if (m0 != NULL) {
1217 m_last(m0)->m_next = nfirst;
1218 return (m0);
984263bc
MD
1219 }
1220
50503f0f 1221 return (nfirst);
984263bc
MD
1222}
1223
1224/*
7b6f875f
JH
1225 * Adds a cluster to a normal mbuf, M_EXT is set on success.
1226 * Deprecated. Use m_getcl() instead.
b6650ec0 1227 */
90775e29
MD
1228void
1229m_mclget(struct mbuf *m, int how)
b6650ec0 1230{
7b6f875f 1231 struct mbcluster *mcl;
b6650ec0 1232
77e294a1 1233 KKASSERT((m->m_flags & M_EXT) == 0);
b5523eac 1234 mcl = objcache_get(mclmeta_cache, MB_OCFLAG(how));
c3ef87ca
MD
1235 if (mcl != NULL) {
1236 linkcluster(m, mcl);
461213b7 1237 ++mbstat[mycpu->gd_cpuid].m_clusters;
a5955b15
MD
1238 } else {
1239 ++mbstat[mycpu->gd_cpuid].m_drops;
c3ef87ca 1240 }
b6650ec0
MD
1241}
1242
df8d1020
MD
1243/*
1244 * Updates to mbcluster must be MPSAFE. Only an entity which already has
1245 * a reference to the cluster can ref it, so we are in no danger of
1246 * racing an add with a subtract. But the operation must still be atomic
1247 * since multiple entities may have a reference on the cluster.
1248 *
1249 * m_mclfree() is almost the same but it must contend with two entities
5bd48c1d 1250 * freeing the cluster at the same time.
df8d1020 1251 */
90775e29 1252static void
7b6f875f 1253m_mclref(void *arg)
b6650ec0 1254{
7b6f875f 1255 struct mbcluster *mcl = arg;
90775e29 1256
7b6f875f 1257 atomic_add_int(&mcl->mcl_refs, 1);
b6650ec0
MD
1258}
1259
1d16b2b5
MD
1260/*
1261 * When dereferencing a cluster we have to deal with a N->0 race, where
 1262 * N entities free their references simultaneously. To do this we use
dee87a60 1263 * atomic_fetchadd_int().
1d16b2b5 1264 */
90775e29 1265static void
7b6f875f 1266m_mclfree(void *arg)
b6650ec0 1267{
7b6f875f 1268 struct mbcluster *mcl = arg;
90775e29 1269
461213b7
MD
1270 if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1271 --mbstat[mycpu->gd_cpuid].m_clusters;
77e294a1 1272 objcache_put(mclmeta_cache, mcl);
461213b7 1273 }
b6650ec0
MD
1274}
1275
c83e573d
SZ
1276static void
1277m_mjclfree(void *arg)
1278{
1279 struct mbcluster *mcl = arg;
1280
1281 if (atomic_fetchadd_int(&mcl->mcl_refs, -1) == 1) {
1282 --mbstat[mycpu->gd_cpuid].m_jclusters;
1283 objcache_put(mjclmeta_cache, mcl);
1284 }
1285}
1286
b6650ec0 1287/*
b6650ec0
MD
1288 * Free a single mbuf and any associated external storage. The successor,
1289 * if any, is returned.
984263bc 1290 *
b6650ec0 1291 * We do need to check non-first mbufs for m_aux, since some existing
984263bc
MD
1292 * code does not call M_PREPEND properly.
1293 * (example: call to bpf_mtap from drivers)
1294 */
982f999d
MD
1295
1296#ifdef MBUF_DEBUG
1297
1298struct mbuf *
1299_m_free(struct mbuf *m, const char *func)
1300
1301#else
1302
984263bc 1303struct mbuf *
b6650ec0 1304m_free(struct mbuf *m)
982f999d
MD
1305
1306#endif
984263bc 1307{
b6650ec0 1308 struct mbuf *n;
4c1e2509 1309 struct globaldata *gd = mycpu;
b6650ec0 1310
361af367 1311 KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
f3f0fc49 1312 KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m));
a8824a1d 1313 --mbtypes[gd->gd_cpuid].stats[m->m_type];
90775e29 1314
7b6f875f 1315 n = m->m_next;
90775e29
MD
1316
1317 /*
7b6f875f
JH
1318 * Make sure the mbuf is in constructed state before returning it
1319 * to the objcache.
90775e29 1320 */
90775e29 1321 m->m_next = NULL;
e9fa4b60 1322 mbufuntrack(m);
982f999d
MD
1323#ifdef MBUF_DEBUG
1324 m->m_hdr.mh_lastfunc = func;
1325#endif
7b6f875f
JH
1326#ifdef notyet
1327 KKASSERT(m->m_nextpkt == NULL);
1328#else
1329 if (m->m_nextpkt != NULL) {
7b6f875f
JH
1330 static int afewtimes = 10;
1331
1332 if (afewtimes-- > 0) {
6ea70f76 1333 kprintf("mfree: m->m_nextpkt != NULL\n");
7ce2998e 1334 print_backtrace(-1);
90775e29 1335 }
7b6f875f
JH
1336 m->m_nextpkt = NULL;
1337 }
1338#endif
1339 if (m->m_flags & M_PKTHDR) {
7b6f875f 1340 m_tag_delete_chain(m); /* eliminate XXX JH */
77e294a1
MD
1341 }
1342
1343 m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
1344
1345 /*
1346 * Clean the M_PKTHDR state so we can return the mbuf to its original
1347 * cache. This is based on the PHCACHE flag which tells us whether
1348 * the mbuf was originally allocated out of a packet-header cache
1349 * or a non-packet-header cache.
1350 */
1351 if (m->m_flags & M_PHCACHE) {
1352 m->m_flags |= M_PKTHDR;
1353 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */
7b6f875f
JH
1354 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */
1355 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */
6b1d6bed 1356 SLIST_INIT(&m->m_pkthdr.tags);
90775e29 1357 }
7b6f875f 1358
77e294a1
MD
1359 /*
1360 * Handle remaining flags combinations. M_CLCACHE tells us whether
1361 * the mbuf was originally allocated from a cluster cache or not,
1362 * and is totally separate from whether the mbuf is currently
1363 * associated with a cluster.
1364 */
77e294a1
MD
1365 switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
1366 case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
1367 /*
1368 * mbuf+cluster cache case. The mbuf was allocated from the
1369 * combined mbuf_cluster cache and can be returned to the
1370 * cache if the cluster hasn't been shared.
1371 */
1372 if (m_sharecount(m) == 1) {
1373 /*
1374 * The cluster has not been shared, we can just
1375 * reset the data pointer and return the mbuf
1376 * to the cluster cache. Note that the reference
1377 * count is left intact (it is still associated with
1378 * an mbuf).
1379 */
1380 m->m_data = m->m_ext.ext_buf;
94eaee9a
JT
1381 if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) {
1382 if (m->m_flags & M_PHCACHE)
1383 objcache_put(mbufphdrjcluster_cache, m);
1384 else
1385 objcache_put(mbufjcluster_cache, m);
c83e573d 1386 --mbstat[mycpu->gd_cpuid].m_jclusters;
94eaee9a
JT
1387 } else {
1388 if (m->m_flags & M_PHCACHE)
1389 objcache_put(mbufphdrcluster_cache, m);
1390 else
1391 objcache_put(mbufcluster_cache, m);
c83e573d 1392 --mbstat[mycpu->gd_cpuid].m_clusters;
94eaee9a 1393 }
77e294a1
MD
1394 } else {
1395 /*
 1396 * Hell. Someone else has a ref on this cluster,
 1397 * so we have to disconnect it, which means we
 1398 * can't put it back into the mbufcluster_cache;
 1399 * we have to destroy the mbuf instead.
1400 *
cb086467
MD
1401 * Other mbuf references to the cluster will typically
1402 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
1403 *
77e294a1
MD
1404 * XXX we could try to connect another cluster to
1405 * it.
1406 */
7b6f875f
JH
1407 m->m_ext.ext_free(m->m_ext.ext_arg);
1408 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
94eaee9a
JT
1409 if (m->m_ext.ext_size == MCLBYTES) {
1410 if (m->m_flags & M_PHCACHE)
1411 objcache_dtor(mbufphdrcluster_cache, m);
1412 else
1413 objcache_dtor(mbufcluster_cache, m);
1414 } else {
1415 if (m->m_flags & M_PHCACHE)
1416 objcache_dtor(mbufphdrjcluster_cache, m);
1417 else
1418 objcache_dtor(mbufjcluster_cache, m);
1419 }
7b6f875f 1420 }
77e294a1
MD
1421 break;
1422 case M_EXT | M_EXT_CLUSTER:
77e294a1
MD
1423 case M_EXT:
1424 /*
1425 * Normal cluster association case, disconnect the cluster from
1426 * the mbuf. The cluster may or may not be custom.
1427 */
1428 m->m_ext.ext_free(m->m_ext.ext_arg);
1429 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
1430 /* fall through */
1431 case 0:
1432 /*
1433 * return the mbuf to the mbuf cache.
1434 */
1435 if (m->m_flags & M_PHCACHE) {
7b6f875f
JH
1436 m->m_data = m->m_pktdat;
1437 objcache_put(mbufphdr_cache, m);
90775e29 1438 } else {
7b6f875f
JH
1439 m->m_data = m->m_dat;
1440 objcache_put(mbuf_cache, m);
90775e29 1441 }
461213b7 1442 --mbstat[mycpu->gd_cpuid].m_mbufs;
77e294a1
MD
1443 break;
1444 default:
1445 if (!panicstr)
ed20d0e3 1446 panic("bad mbuf flags %p %08x", m, m->m_flags);
77e294a1 1447 break;
b6650ec0 1448 }
984263bc
MD
1449 return (n);
1450}
1451
982f999d
MD
1452#ifdef MBUF_DEBUG
1453
1454void
1455_m_freem(struct mbuf *m, const char *func)
1456{
1457 while (m)
1458 m = _m_free(m, func);
1459}
1460
1461#else
1462
984263bc 1463void
b6650ec0 1464m_freem(struct mbuf *m)
984263bc 1465{
90775e29
MD
1466 while (m)
1467 m = m_free(m);
984263bc
MD
1468}
1469
982f999d
MD
1470#endif
1471
7c85e8ac
SW
1472void
1473m_extadd(struct mbuf *m, caddr_t buf, u_int size, void (*reff)(void *),
1474 void (*freef)(void *), void *arg)
1475{
1476 m->m_ext.ext_arg = arg;
1477 m->m_ext.ext_buf = buf;
1478 m->m_ext.ext_ref = reff;
1479 m->m_ext.ext_free = freef;
1480 m->m_ext.ext_size = size;
1481 reff(arg);
1482 m->m_data = buf;
1483 m->m_flags |= M_EXT;
1484}
1485
984263bc 1486/*
df80f2ea 1487 * mbuf utility routines
984263bc
MD
1488 */
1489
1490/*
7b6f875f 1491 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
984263bc
MD
1492 * copy junk along.
1493 */
1494struct mbuf *
8a3125c6 1495m_prepend(struct mbuf *m, int len, int how)
984263bc
MD
1496{
1497 struct mbuf *mn;
1498
c3ef87ca
MD
1499 if (m->m_flags & M_PKTHDR)
1500 mn = m_gethdr(how, m->m_type);
1501 else
1502 mn = m_get(how, m->m_type);
7b6f875f 1503 if (mn == NULL) {
984263bc 1504 m_freem(m);
7b6f875f 1505 return (NULL);
984263bc
MD
1506 }
1507 if (m->m_flags & M_PKTHDR)
1508 M_MOVE_PKTHDR(mn, m);
1509 mn->m_next = m;
1510 m = mn;
1511 if (len < MHLEN)
1512 MH_ALIGN(m, len);
1513 m->m_len = len;
1514 return (m);
1515}
1516
1517/*
1518 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
1519 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
b5523eac 1520 * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
984263bc
MD
1521 * Note that the copy is read-only, because clusters are not copied,
1522 * only their reference counts are incremented.
1523 */
984263bc 1524struct mbuf *
8a3125c6 1525m_copym(const struct mbuf *m, int off0, int len, int wait)
984263bc 1526{
1fd87d54
RG
1527 struct mbuf *n, **np;
1528 int off = off0;
984263bc
MD
1529 struct mbuf *top;
1530 int copyhdr = 0;
1531
1532 KASSERT(off >= 0, ("m_copym, negative off %d", off));
1533 KASSERT(len >= 0, ("m_copym, negative len %d", len));
5bd48c1d 1534 if (off == 0 && (m->m_flags & M_PKTHDR))
984263bc
MD
1535 copyhdr = 1;
1536 while (off > 0) {
1537 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
1538 if (off < m->m_len)
1539 break;
1540 off -= m->m_len;
1541 m = m->m_next;
1542 }
1543 np = &top;
5bd48c1d 1544 top = NULL;
984263bc 1545 while (len > 0) {
7b6f875f 1546 if (m == NULL) {
984263bc
MD
1547 KASSERT(len == M_COPYALL,
1548 ("m_copym, length > size of mbuf chain"));
1549 break;
1550 }
c3ef87ca
MD
1551 /*
1552 * Because we are sharing any cluster attachment below,
1553 * be sure to get an mbuf that does not have a cluster
1554 * associated with it.
1555 */
1556 if (copyhdr)
1557 n = m_gethdr(wait, m->m_type);
1558 else
1559 n = m_get(wait, m->m_type);
984263bc 1560 *np = n;
7b6f875f 1561 if (n == NULL)
984263bc
MD
1562 goto nospace;
1563 if (copyhdr) {
1564 if (!m_dup_pkthdr(n, m, wait))
1565 goto nospace;
1566 if (len == M_COPYALL)
1567 n->m_pkthdr.len -= off0;
1568 else
1569 n->m_pkthdr.len = len;
1570 copyhdr = 0;
1571 }
1572 n->m_len = min(len, m->m_len - off);
1573 if (m->m_flags & M_EXT) {
c3ef87ca 1574 KKASSERT((n->m_flags & M_EXT) == 0);
984263bc 1575 n->m_data = m->m_data + off;
7b6f875f 1576 m->m_ext.ext_ref(m->m_ext.ext_arg);
984263bc 1577 n->m_ext = m->m_ext;
b542cd49 1578 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
7eccf245 1579 } else {
984263bc
MD
1580 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
1581 (unsigned)n->m_len);
7eccf245 1582 }
984263bc
MD
1583 if (len != M_COPYALL)
1584 len -= n->m_len;
1585 off = 0;
1586 m = m->m_next;
1587 np = &n->m_next;
1588 }
7b6f875f 1589 if (top == NULL)
461213b7 1590 ++mbstat[mycpu->gd_cpuid].m_mcfail;
984263bc
MD
1591 return (top);
1592nospace:
1593 m_freem(top);
461213b7 1594 ++mbstat[mycpu->gd_cpuid].m_mcfail;
7b6f875f 1595 return (NULL);
984263bc
MD
1596}
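/*
 * Hedged sketch contrasting the copy flavours (not from the original
 * sources): m_copym() shares clusters and is therefore read-only,
 * while m_dup() further below also copies the data.
 *
 *	ro = m_copym(m, 0, M_COPYALL, M_NOWAIT);	(cheap, read-only)
 *	rw = m_dup(m, M_NOWAIT);			(writable copy)
 */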
1597
1598/*
1599 * Copy an entire packet, including header (which must be present).
1600 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
1601 * Note that the copy is read-only, because clusters are not copied,
1602 * only their reference counts are incremented.
1603 * Preserve alignment of the first mbuf so if the creator has left
1604 * some room at the beginning (e.g. for inserting protocol headers)
1605 * the copies also have the room available.
1606 */
1607struct mbuf *
8a3125c6 1608m_copypacket(struct mbuf *m, int how)
984263bc
MD
1609{
1610 struct mbuf *top, *n, *o;
1611
7f3602fe 1612 n = m_gethdr(how, m->m_type);
984263bc
MD
1613 top = n;
1614 if (!n)
1615 goto nospace;
1616
1617 if (!m_dup_pkthdr(n, m, how))
1618 goto nospace;
1619 n->m_len = m->m_len;
1620 if (m->m_flags & M_EXT) {
c3ef87ca 1621 KKASSERT((n->m_flags & M_EXT) == 0);
984263bc 1622 n->m_data = m->m_data;
7b6f875f 1623 m->m_ext.ext_ref(m->m_ext.ext_arg);
984263bc 1624 n->m_ext = m->m_ext;
b542cd49 1625 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
984263bc
MD
1626 } else {
 1627 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat);
1628 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1629 }
1630
1631 m = m->m_next;
1632 while (m) {
7b6f875f 1633 o = m_get(how, m->m_type);
984263bc
MD
1634 if (!o)
1635 goto nospace;
1636
1637 n->m_next = o;
1638 n = n->m_next;
1639
1640 n->m_len = m->m_len;
1641 if (m->m_flags & M_EXT) {
c3ef87ca 1642 KKASSERT((n->m_flags & M_EXT) == 0);
984263bc 1643 n->m_data = m->m_data;
7b6f875f 1644 m->m_ext.ext_ref(m->m_ext.ext_arg);
984263bc 1645 n->m_ext = m->m_ext;
b542cd49 1646 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
984263bc
MD
1647 } else {
1648 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1649 }
1650
1651 m = m->m_next;
1652 }
1653 return top;
1654nospace:
1655 m_freem(top);
461213b7 1656 ++mbstat[mycpu->gd_cpuid].m_mcfail;
7b6f875f 1657 return (NULL);
984263bc
MD
1658}
1659
1660/*
1661 * Copy data from an mbuf chain starting "off" bytes from the beginning,
1662 * continuing for "len" bytes, into the indicated buffer.
1663 */
1664void
8a3125c6 1665m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
984263bc 1666{
1fd87d54 1667 unsigned count;
984263bc
MD
1668
1669 KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1670 KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1671 while (off > 0) {
1672 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1673 if (off < m->m_len)
1674 break;
1675 off -= m->m_len;
1676 m = m->m_next;
1677 }
1678 while (len > 0) {
1679 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1680 count = min(m->m_len - off, len);
1681 bcopy(mtod(m, caddr_t) + off, cp, count);
1682 len -= count;
1683 cp += count;
1684 off = 0;
1685 m = m->m_next;
1686 }
1687}
1688
1689/*
1690 * Copy a packet header mbuf chain into a completely new chain, including
1691 * copying any mbuf clusters. Use this instead of m_copypacket() when
1692 * you need a writable copy of an mbuf chain.
1693 */
1694struct mbuf *
8a3125c6 1695m_dup(struct mbuf *m, int how)
984263bc
MD
1696{
1697 struct mbuf **p, *top = NULL;
1698 int remain, moff, nsize;
1699
1700 /* Sanity check */
1701 if (m == NULL)
50503f0f 1702 return (NULL);
5e2195bf 1703 KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
984263bc
MD
1704
1705 /* While there's more data, get a new mbuf, tack it on, and fill it */
1706 remain = m->m_pkthdr.len;
1707 moff = 0;
1708 p = &top;
1709 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
1710 struct mbuf *n;
1711
1712 /* Get the next new mbuf */
50503f0f
JH
1713 n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1714 &nsize);
984263bc
MD
1715 if (n == NULL)
1716 goto nospace;
50503f0f 1717 if (top == NULL)
984263bc 1718 if (!m_dup_pkthdr(n, m, how))
50503f0f 1719 goto nospace0;
984263bc
MD
1720
1721 /* Link it into the new chain */
1722 *p = n;
1723 p = &n->m_next;
1724
1725 /* Copy data from original mbuf(s) into new mbuf */
50503f0f 1726 n->m_len = 0;
984263bc
MD
1727 while (n->m_len < nsize && m != NULL) {
1728 int chunk = min(nsize - n->m_len, m->m_len - moff);
1729
1730 bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1731 moff += chunk;
1732 n->m_len += chunk;
1733 remain -= chunk;
1734 if (moff == m->m_len) {
1735 m = m->m_next;
1736 moff = 0;
1737 }
1738 }
1739
1740 /* Check correct total mbuf length */
1741 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
50503f0f 1742 ("%s: bogus m_pkthdr.len", __func__));
984263bc
MD
1743 }
1744 return (top);
1745
1746nospace:
1747 m_freem(top);
50503f0f 1748nospace0:
461213b7 1749 ++mbstat[mycpu->gd_cpuid].m_mcfail;
50503f0f 1750 return (NULL);
984263bc
MD
1751}
1752
3bf6fec3
MD
1753/*
1754 * Copy the non-packet mbuf data chain into a new set of mbufs, including
1755 * copying any mbuf clusters. This is typically used to realign a data
1756 * chain by nfs_realign().
1757 *
b5523eac
SW
1758 * The original chain is left intact. how should be M_WAITOK or M_NOWAIT
1759 * and NULL can be returned if M_NOWAIT is passed.
3bf6fec3
MD
1760 *
 1761 * Be careful to use cluster mbufs; a large mbuf chain converted to
 1762 * non-cluster mbufs can exhaust our supply of mbufs.
1763 */
1764struct mbuf *
1765m_dup_data(struct mbuf *m, int how)
1766{
1767 struct mbuf **p, *n, *top = NULL;
1768 int mlen, moff, chunk, gsize, nsize;
1769
1770 /*
1771 * Degenerate case
1772 */
1773 if (m == NULL)
1774 return (NULL);
1775
1776 /*
1777 * Optimize the mbuf allocation but do not get too carried away.
1778 */
1779 if (m->m_next || m->m_len > MLEN)
94eaee9a
JT
1780 if (m->m_flags & M_EXT && m->m_ext.ext_size == MCLBYTES)
1781 gsize = MCLBYTES;
1782 else
1783 gsize = MJUMPAGESIZE;
3bf6fec3
MD
1784 else
1785 gsize = MLEN;
1786
1787 /* Chain control */
1788 p = &top;
1789 n = NULL;
1790 nsize = 0;
1791
1792 /*
1793 * Scan the mbuf chain until nothing is left, the new mbuf chain
1794 * will be allocated on the fly as needed.
1795 */
1796 while (m) {
1797 mlen = m->m_len;
1798 moff = 0;
1799
1800 while (mlen) {
1801 KKASSERT(m->m_type == MT_DATA);
1802 if (n == NULL) {
1803 n = m_getl(gsize, how, MT_DATA, 0, &nsize);
 1804 if (n == NULL)
 1805 goto nospace;
 1806 n->m_len = 0;
1807 *p = n;
1808 p = &n->m_next;
1809 }
1810 chunk = imin(mlen, nsize);
1811 bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1812 mlen -= chunk;
1813 moff += chunk;
1814 n->m_len += chunk;
1815 nsize -= chunk;
1816 if (nsize == 0)
1817 n = NULL;
1818 }
1819 m = m->m_next;
1820 }
1821 *p = NULL;
1822 return(top);
1823nospace:
1824 *p = NULL;
1825 m_freem(top);
461213b7 1826 ++mbstat[mycpu->gd_cpuid].m_mcfail;
3bf6fec3
MD
1827 return (NULL);
1828}
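A sketch of the realignment use mentioned above (illustrative; the function
name is hypothetical): build a fresh, aligned copy of a data chain and, only
on success, drop the original.

static struct mbuf *
example_realign(struct mbuf *m, int how)
{
	struct mbuf *n;

	n = m_dup_data(m, how);
	if (n == NULL)
		return (NULL);		/* possible when how is M_NOWAIT */
	m_freem(m);			/* replace the old chain with the copy */
	return (n);
}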
1829
984263bc
MD
1830/*
1831 * Concatenate mbuf chain n to m.
1832 * Both chains must be of the same type (e.g. MT_DATA).
1833 * Any m_pkthdr is not updated.
1834 */
1835void
8a3125c6 1836m_cat(struct mbuf *m, struct mbuf *n)
984263bc 1837{
50503f0f 1838 m = m_last(m);
984263bc
MD
1839 while (n) {
1840 if (m->m_flags & M_EXT ||
1841 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1842 /* just join the two chains */
1843 m->m_next = n;
1844 return;
1845 }
1846 /* splat the data from one into the other */
1847 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1848 (u_int)n->m_len);
1849 m->m_len += n->m_len;
1850 n = m_free(n);
1851 }
1852}
1853
1854void
8a3125c6 1855m_adj(struct mbuf *mp, int req_len)
984263bc 1856{
1fd87d54
RG
1857 int len = req_len;
1858 struct mbuf *m;
1859 int count;
984263bc
MD
1860
1861 if ((m = mp) == NULL)
1862 return;
1863 if (len >= 0) {
1864 /*
1865 * Trim from head.
1866 */
1867 while (m != NULL && len > 0) {
1868 if (m->m_len <= len) {
1869 len -= m->m_len;
1870 m->m_len = 0;
1871 m = m->m_next;
1872 } else {
1873 m->m_len -= len;
1874 m->m_data += len;
1875 len = 0;
1876 }
1877 }
1878 m = mp;
1879 if (mp->m_flags & M_PKTHDR)
1880 m->m_pkthdr.len -= (req_len - len);
1881 } else {
1882 /*
1883 * Trim from tail. Scan the mbuf chain,
1884 * calculating its length and finding the last mbuf.
1885 * If the adjustment only affects this mbuf, then just
1886 * adjust and return. Otherwise, rescan and truncate
1887 * after the remaining size.
1888 */
1889 len = -len;
1890 count = 0;
1891 for (;;) {
1892 count += m->m_len;
60233e58 1893 if (m->m_next == NULL)
984263bc
MD
1894 break;
1895 m = m->m_next;
1896 }
1897 if (m->m_len >= len) {
1898 m->m_len -= len;
1899 if (mp->m_flags & M_PKTHDR)
1900 mp->m_pkthdr.len -= len;
1901 return;
1902 }
1903 count -= len;
1904 if (count < 0)
1905 count = 0;
1906 /*
1907 * Correct length for chain is "count".
1908 * Find the mbuf with last data, adjust its length,
1909 * and toss data from remaining mbufs on chain.
1910 */
1911 m = mp;
1912 if (m->m_flags & M_PKTHDR)
1913 m->m_pkthdr.len = count;
1914 for (; m; m = m->m_next) {
1915 if (m->m_len >= count) {
1916 m->m_len = count;
1917 break;
1918 }
1919 count -= m->m_len;
1920 }
1921 while (m->m_next)
 1922 (m = m->m_next)->m_len = 0;
1923 }
1924}
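Sketch of typical m_adj() use (illustrative only): a positive length trims
bytes from the head of the chain, a negative length trims from the tail.
ETHER_HDR_LEN assumes <net/ethernet.h>.

static void
example_trim(struct mbuf *m, int tail_pad)
{
	/* Strip the link-level header from the front of the packet. */
	m_adj(m, ETHER_HDR_LEN);

	/* Drop any trailing padding from the end. */
	if (tail_pad > 0)
		m_adj(m, -tail_pad);
}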
1925
a3768f58
RP
1926/*
1927 * Set the m_data pointer of a newly-allocated mbuf
1928 * to place an object of the specified size at the
1929 * end of the mbuf, longword aligned.
1930 */
1931void
1932m_align(struct mbuf *m, int len)
1933{
1934 int adjust;
1935
1936 if (m->m_flags & M_EXT)
1937 adjust = m->m_ext.ext_size - len;
1938 else if (m->m_flags & M_PKTHDR)
1939 adjust = MHLEN - len;
1940 else
1941 adjust = MLEN - len;
1942 m->m_data += adjust &~ (sizeof(long)-1);
1943}
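Illustrative sketch: m_align() is intended for a freshly allocated, still
empty mbuf, positioning m_data so that len bytes end at the buffer boundary,
long-word aligned. The helper name is hypothetical and len is assumed to be
no larger than MHLEN here.

static struct mbuf *
example_aligned_hdr(int len)
{
	struct mbuf *m;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m_align(m, len);	/* must be done before any data is stored */
	m->m_len = len;
	m->m_pkthdr.len = len;
	return (m);
}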
1944
0909f798
NA
1945/*
1946 * Create a writable copy of the mbuf chain. While doing this
1947 * we compact the chain with a goal of producing a chain with
1948 * at most two mbufs. The second mbuf in this chain is likely
1949 * to be a cluster. The primary purpose of this work is to create
1950 * a writable packet for encryption, compression, etc. The
1951 * secondary goal is to linearize the data so the data can be
1952 * passed to crypto hardware in the most efficient manner possible.
1953 */
1954struct mbuf *
1955m_unshare(struct mbuf *m0, int how)
1956{
1957 struct mbuf *m, *mprev;
1958 struct mbuf *n, *mfirst, *mlast;
1959 int len, off;
1960
1961 mprev = NULL;
1962 for (m = m0; m != NULL; m = mprev->m_next) {
1963 /*
1964 * Regular mbufs are ignored unless there's a cluster
 1965 * in front of them that we can use to coalesce. We do
1966 * the latter mainly so later clusters can be coalesced
1967 * also w/o having to handle them specially (i.e. convert
1968 * mbuf+cluster -> cluster). This optimization is heavily
1969 * influenced by the assumption that we're running over
1970 * Ethernet where MCLBYTES is large enough that the max
1971 * packet size will permit lots of coalescing into a
1972 * single cluster. This in turn permits efficient
1973 * crypto operations, especially when using hardware.
1974 */
1975 if ((m->m_flags & M_EXT) == 0) {
1976 if (mprev && (mprev->m_flags & M_EXT) &&
1977 m->m_len <= M_TRAILINGSPACE(mprev)) {
1978 /* XXX: this ignores mbuf types */
1979 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
1980 mtod(m, caddr_t), m->m_len);
1981 mprev->m_len += m->m_len;
1982 mprev->m_next = m->m_next; /* unlink from chain */
1983 m_free(m); /* reclaim mbuf */
1984 } else {
1985 mprev = m;
1986 }
1987 continue;
1988 }
1989 /*
1990 * Writable mbufs are left alone (for now).
1991 */
1992 if (M_WRITABLE(m)) {
1993 mprev = m;
1994 continue;
1995 }
1996
1997 /*
1998 * Not writable, replace with a copy or coalesce with
1999 * the previous mbuf if possible (since we have to copy
2000 * it anyway, we try to reduce the number of mbufs and
2001 * clusters so that future work is easier).
2002 */
2003 KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
2004 /* NB: we only coalesce into a cluster or larger */
2005 if (mprev != NULL && (mprev->m_flags & M_EXT) &&
2006 m->m_len <= M_TRAILINGSPACE(mprev)) {
2007 /* XXX: this ignores mbuf types */
2008 memcpy(mtod(mprev, caddr_t) + mprev->m_len,
2009 mtod(m, caddr_t), m->m_len);
2010 mprev->m_len += m->m_len;
2011 mprev->m_next = m->m_next; /* unlink from chain */
2012 m_free(m); /* reclaim mbuf */
2013 continue;
2014 }
2015
2016 /*
2017 * Allocate new space to hold the copy...
2018 */
2019 /* XXX why can M_PKTHDR be set past the first mbuf? */
2020 if (mprev == NULL && (m->m_flags & M_PKTHDR)) {
2021 /*
2022 * NB: if a packet header is present we must
2023 * allocate the mbuf separately from any cluster
2024 * because M_MOVE_PKTHDR will smash the data
2025 * pointer and drop the M_EXT marker.
2026 */
2027 MGETHDR(n, how, m->m_type);
2028 if (n == NULL) {
2029 m_freem(m0);
2030 return (NULL);
2031 }
2032 M_MOVE_PKTHDR(n, m);
2033 MCLGET(n, how);
2034 if ((n->m_flags & M_EXT) == 0) {
2035 m_free(n);
2036 m_freem(m0);
2037 return (NULL);
2038 }
2039 } else {
2040 n = m_getcl(how, m->m_type, m->m_flags);
2041 if (n == NULL) {
2042 m_freem(m0);
2043 return (NULL);
2044 }
2045 }
2046 /*
2047 * ... and copy the data. We deal with jumbo mbufs
2048 * (i.e. m_len > MCLBYTES) by splitting them into
2049 * clusters. We could just malloc a buffer and make
2050 * it external but too many device drivers don't know
2051 * how to break up the non-contiguous memory when
2052 * doing DMA.
2053 */
2054 len = m->m_len;
2055 off = 0;
2056 mfirst = n;
2057 mlast = NULL;
2058 for (;;) {
2059 int cc = min(len, MCLBYTES);
2060 memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
2061 n->m_len = cc;
2062 if (mlast != NULL)
2063 mlast->m_next = n;
2064 mlast = n;
2065
2066 len -= cc;
2067 if (len <= 0)
2068 break;
2069 off += cc;
2070
2071 n = m_getcl(how, m->m_type, m->m_flags);
2072 if (n == NULL) {
2073 m_freem(mfirst);
2074 m_freem(m0);
2075 return (NULL);
2076 }
2077 }
2078 n->m_next = m->m_next;
2079 if (mprev == NULL)
2080 m0 = mfirst; /* new head of chain */
2081 else
2082 mprev->m_next = mfirst; /* replace old mbuf */
2083 m_free(m); /* release old mbuf */
2084 mprev = mfirst;
2085 }
2086 return (m0);
2087}
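Sketch of the usual m_unshare() caller pattern (illustrative): code such as
encryption or compression that is about to modify packet bytes in place makes
the whole chain writable first. Note the ownership rule: on failure the
original chain has already been freed.

static struct mbuf *
example_make_writable(struct mbuf *m)
{
	m = m_unshare(m, M_NOWAIT);
	if (m == NULL)
		return (NULL);	/* original chain was freed by m_unshare() */
	return (m);		/* possibly a new chain head */
}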
2088
984263bc 2089/*
7b6f875f 2090 * Rearrange an mbuf chain so that len bytes are contiguous
9e4465af
MD
2091 * and in the data area of an mbuf (so that mtod will work for a structure
2092 * of size len). Returns the resulting mbuf chain on success, frees it and
2093 * returns null on failure. If there is room, it will add up to
2094 * max_protohdr-len extra bytes to the contiguous region in an attempt to
2095 * avoid being called next time.
984263bc 2096 */
984263bc 2097struct mbuf *
8a3125c6 2098m_pullup(struct mbuf *n, int len)
984263bc 2099{
1fd87d54
RG
2100 struct mbuf *m;
2101 int count;
984263bc
MD
2102 int space;
2103
2104 /*
2105 * If first mbuf has no cluster, and has room for len bytes
2106 * without shifting current data, pullup into it,
2107 * otherwise allocate a new mbuf to prepend to the chain.
2108 */
7b6f875f
JH
2109 if (!(n->m_flags & M_EXT) &&
2110 n->m_data + len < &n->m_dat[MLEN] &&
2111 n->m_next) {
984263bc
MD
2112 if (n->m_len >= len)
2113 return (n);
2114 m = n;
2115 n = n->m_next;
2116 len -= m->m_len;
2117 } else {
2118 if (len > MHLEN)
2119 goto bad;
c3ef87ca 2120 if (n->m_flags & M_PKTHDR)
b5523eac 2121 m = m_gethdr(M_NOWAIT, n->m_type);
c3ef87ca 2122 else
b5523eac 2123 m = m_get(M_NOWAIT, n->m_type);
7b6f875f 2124 if (m == NULL)
984263bc
MD
2125 goto bad;
2126 m->m_len = 0;
2127 if (n->m_flags & M_PKTHDR)
2128 M_MOVE_PKTHDR(m, n);
2129 }
2130 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
2131 do {
2132 count = min(min(max(len, max_protohdr), space), n->m_len);
2133 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
2134 (unsigned)count);
2135 len -= count;
2136 m->m_len += count;
2137 n->m_len -= count;
2138 space -= count;
2139 if (n->m_len)
2140 n->m_data += count;
2141 else
2142 n = m_free(n);
2143 } while (len > 0 && n);
2144 if (len > 0) {
7b6f875f 2145 m_free(m);
984263bc
MD
2146 goto bad;
2147 }
2148 m->m_next = n;
2149 return (m);
2150bad:
2151 m_freem(n);
461213b7 2152 ++mbstat[mycpu->gd_cpuid].m_mcfail;
7b6f875f 2153 return (NULL);
984263bc
MD
2154}
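The canonical m_pullup() idiom, as an illustrative sketch (struct ip assumes
<netinet/ip.h>; the helper name is hypothetical): guarantee the first mbuf
holds a contiguous header before dereferencing it with mtod().

static struct ip *
example_pullup_ip(struct mbuf **mp)
{
	struct mbuf *m = *mp;

	if (m->m_len < (int)sizeof(struct ip)) {
		m = m_pullup(m, sizeof(struct ip));
		if (m == NULL) {
			/* m_pullup() freed the chain on failure. */
			*mp = NULL;
			return (NULL);
		}
		*mp = m;
	}
	return (mtod(m, struct ip *));
}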
2155
2156/*
2157 * Partition an mbuf chain in two pieces, returning the tail --
2158 * all but the first len0 bytes. In case of failure, it returns NULL and
2159 * attempts to restore the chain to its original state.
2160 *
2161 * Note that the resulting mbufs might be read-only, because the new
2162 * mbuf can end up sharing an mbuf cluster with the original mbuf if
2163 * the "breaking point" happens to lie within a cluster mbuf. Use the
2164 * M_WRITABLE() macro to check for this case.
2165 */
2166struct mbuf *
8a3125c6 2167m_split(struct mbuf *m0, int len0, int wait)
984263bc 2168{
1fd87d54 2169 struct mbuf *m, *n;
984263bc
MD
2170 unsigned len = len0, remain;
2171
2172 for (m = m0; m && len > m->m_len; m = m->m_next)
2173 len -= m->m_len;
7b6f875f
JH
2174 if (m == NULL)
2175 return (NULL);
984263bc
MD
2176 remain = m->m_len - len;
2177 if (m0->m_flags & M_PKTHDR) {
7b6f875f
JH
2178 n = m_gethdr(wait, m0->m_type);
2179 if (n == NULL)
2180 return (NULL);
984263bc
MD
2181 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
2182 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
2183 m0->m_pkthdr.len = len0;
2184 if (m->m_flags & M_EXT)
2185 goto extpacket;
2186 if (remain > MHLEN) {
2187 /* m can't be the lead packet */
2188 MH_ALIGN(n, 0);
2189 n->m_next = m_split(m, len, wait);
7b6f875f
JH
2190 if (n->m_next == NULL) {
2191 m_free(n);
2192 return (NULL);
984263bc
MD
2193 } else {
2194 n->m_len = 0;
2195 return (n);
2196 }
2197 } else
2198 MH_ALIGN(n, remain);
2199 } else if (remain == 0) {
2200 n = m->m_next;
d8061892 2201 m->m_next = NULL;
984263bc
MD
2202 return (n);
2203 } else {
7b6f875f
JH
2204 n = m_get(wait, m->m_type);
2205 if (n == NULL)
2206 return (NULL);
984263bc
MD
2207 M_ALIGN(n, remain);
2208 }
2209extpacket:
2210 if (m->m_flags & M_EXT) {
c3ef87ca 2211 KKASSERT((n->m_flags & M_EXT) == 0);
984263bc 2212 n->m_data = m->m_data + len;
7b6f875f 2213 m->m_ext.ext_ref(m->m_ext.ext_arg);
7eccf245 2214 n->m_ext = m->m_ext;
b542cd49 2215 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
984263bc
MD
2216 } else {
2217 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
2218 }
2219 n->m_len = remain;
2220 m->m_len = len;
2221 n->m_next = m->m_next;
d8061892 2222 m->m_next = NULL;
984263bc
MD
2223 return (n);
2224}
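Illustrative sketch: splitting a packet after hdrlen bytes, for instance to
separate a header portion from its payload. Per the comment above, the tail
may share a cluster with the head and can therefore be read-only.

static int
example_split(struct mbuf *m, int hdrlen, struct mbuf **tailp)
{
	struct mbuf *tail;

	tail = m_split(m, hdrlen, M_NOWAIT);
	if (tail == NULL)
		return (ENOBUFS);	/* original chain left as it was */
	*tailp = tail;
	return (0);
}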
50503f0f 2225
984263bc
MD
2226/*
2227 * Routine to copy from device local memory into mbufs.
50503f0f 2228 * Note: "offset" is ill-defined and always called as 0, so ignore it.
984263bc
MD
2229 */
2230struct mbuf *
50503f0f
JH
2231m_devget(char *buf, int len, int offset, struct ifnet *ifp,
2232 void (*copy)(volatile const void *from, volatile void *to, size_t length))
984263bc 2233{
50503f0f
JH
2234 struct mbuf *m, *mfirst = NULL, **mtail;
2235 int nsize, flags;
2236
2237 if (copy == NULL)
2238 copy = bcopy;
2239 mtail = &mfirst;
2240 flags = M_PKTHDR;
2241
2242 while (len > 0) {
b5523eac 2243 m = m_getl(len, M_NOWAIT, MT_DATA, flags, &nsize);
50503f0f
JH
2244 if (m == NULL) {
2245 m_freem(mfirst);
2246 return (NULL);
984263bc 2247 }
50503f0f
JH
2248 m->m_len = min(len, nsize);
2249
2250 if (flags & M_PKTHDR) {
2251 if (len + max_linkhdr <= nsize)
2252 m->m_data += max_linkhdr;
2253 m->m_pkthdr.rcvif = ifp;
2254 m->m_pkthdr.len = len;
2255 flags = 0;
984263bc 2256 }
50503f0f
JH
2257
2258 copy(buf, m->m_data, (unsigned)m->m_len);
2259 buf += m->m_len;
2260 len -= m->m_len;
2261 *mtail = m;
2262 mtail = &m->m_next;
984263bc 2263 }
50503f0f
JH
2264
2265 return (mfirst);
984263bc
MD
2266}
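Sketch of a driver receive path using m_devget() (illustrative; the buffer
and length parameters stand in for device-specific details): copy a received
frame from device-local memory into a fresh packet-header chain.

static struct mbuf *
example_rx_copy(char *devbuf, int framelen, struct ifnet *ifp)
{
	/* offset is ignored (pass 0); a NULL copy routine means bcopy. */
	return (m_devget(devbuf, framelen, 0, ifp, NULL));
}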
2267
cf12ba3c
SZ
2268/*
2269 * Routine to pad mbuf to the specified length 'padto'.
2270 */
2271int
2272m_devpad(struct mbuf *m, int padto)
2273{
2274 struct mbuf *last = NULL;
2275 int padlen;
2276
2277 if (padto <= m->m_pkthdr.len)
2278 return 0;
2279
2280 padlen = padto - m->m_pkthdr.len;
2281
2282 /* if there's only the packet-header and we can pad there, use it. */
2283 if (m->m_pkthdr.len == m->m_len && M_TRAILINGSPACE(m) >= padlen) {
2284 last = m;
2285 } else {
2286 /*
2287 * Walk packet chain to find last mbuf. We will either
2288 * pad there, or append a new mbuf and pad it
2289 */
2290 for (last = m; last->m_next != NULL; last = last->m_next)
2291 ; /* EMPTY */
2292
2293 /* `last' now points to last in chain. */
2294 if (M_TRAILINGSPACE(last) < padlen) {
2295 struct mbuf *n;
2296
2297 /* Allocate new empty mbuf, pad it. Compact later. */
b5523eac 2298 MGET(n, M_NOWAIT, MT_DATA);
cf12ba3c
SZ
2299 if (n == NULL)
2300 return ENOBUFS;
2301 n->m_len = 0;
2302 last->m_next = n;
2303 last = n;
2304 }
2305 }
2306 KKASSERT(M_TRAILINGSPACE(last) >= padlen);
2307 KKASSERT(M_WRITABLE(last));
2308
2309 /* Now zero the pad area */
2310 bzero(mtod(last, char *) + last->m_len, padlen);
2311 last->m_len += padlen;
2312 m->m_pkthdr.len += padlen;
2313 return 0;
2314}
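Illustrative sketch: padding a short transmit frame up to the 60-byte minimum
Ethernet payload (ETHER_MIN_LEN minus the CRC) before handing it to hardware
that does not pad on its own. The 60-byte target is the caller's assumption,
not something m_devpad() imposes.

static int
example_tx_pad(struct mbuf *m)
{
	/* No-op if the packet already meets the target length. */
	return (m_devpad(m, 60));
}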
2315
984263bc
MD
2316/*
2317 * Copy data from a buffer back into the indicated mbuf chain,
2318 * starting "off" bytes from the beginning, extending the mbuf
2319 * chain if necessary.
2320 */
2321void
8a3125c6 2322m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
984263bc 2323{
1fd87d54
RG
2324 int mlen;
2325 struct mbuf *m = m0, *n;
984263bc
MD
2326 int totlen = 0;
2327
7b6f875f 2328 if (m0 == NULL)
984263bc
MD
2329 return;
2330 while (off > (mlen = m->m_len)) {
2331 off -= mlen;
2332 totlen += mlen;
7b6f875f 2333 if (m->m_next == NULL) {
b5523eac 2334 n = m_getclr(M_NOWAIT, m->m_type);
7b6f875f 2335 if (n == NULL)
984263bc
MD
2336 goto out;
2337 n->m_len = min(MLEN, len + off);
2338 m->m_next = n;
2339 }
2340 m = m->m_next;
2341 }
2342 while (len > 0) {
2343 mlen = min (m->m_len - off, len);
2344 bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
2345 cp += mlen;
2346 len -= mlen;
2347 mlen += off;
2348 off = 0;
2349 totlen += mlen;
2350 if (len == 0)
2351 break;
7b6f875f 2352 if (m->m_next == NULL) {
b5523eac 2353 n = m_get(M_NOWAIT, m->m_type);
7b6f875f 2354 if (n == NULL)
984263bc
MD
2355 break;
2356 n->m_len = min(MLEN, len);
2357 m->m_next = n;
2358 }
2359 m = m->m_next;
2360 }
2361out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
2362 m->m_pkthdr.len = totlen;
2363}
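Illustrative sketch: m_copyback() overwrites bytes at a given offset and
extends the chain when it is too short; a typical use is patching a computed
checksum back into a packet. The offset parameter is the caller's assumption.

static void
example_patch_cksum(struct mbuf *m, int cksum_off, uint16_t cksum)
{
	/* Write the 2-byte checksum at cksum_off, growing the chain if needed. */
	m_copyback(m, cksum_off, sizeof(cksum), (caddr_t)&cksum);
}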
2364
bf2cc98c
RP
2365/*
 2366 * Append the specified data to the indicated mbuf chain.
2367 * Extend the mbuf chain if the new data does not fit in
2368 * existing space.
2369 *
2370 * Return 1 if able to complete the job; otherwise 0.
2371 */
2372int
2373m_append(struct mbuf *m0, int len, c_caddr_t cp)
2374{
2375 struct mbuf *m, *n;
2376 int remainder, space;
2377
2378 for (m = m0; m->m_next != NULL; m = m->m_next)
2379 ;
2380 remainder = len;
2381 space = M_TRAILINGSPACE(m);
2382 if (space > 0) {
2383 /*
2384 * Copy into available space.
2385 */
2386 if (space > remainder)
2387 space = remainder;
2388 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2389 m->m_len += space;
2390 cp += space, remainder -= space;
2391 }
2392 while (remainder > 0) {
2393 /*
2394 * Allocate a new mbuf; could check space
2395 * and allocate a cluster instead.
2396 */
b5523eac 2397 n = m_get(M_NOWAIT, m->m_type);
bf2cc98c
RP
2398 if (n == NULL)
2399 break;
2400 n->m_len = min(MLEN, remainder);
2401 bcopy(cp, mtod(n, caddr_t), n->m_len);
2402 cp += n->m_len, remainder -= n->m_len;
2403 m->m_next = n;
2404 m = n;
2405 }
2406 if (m0->m_flags & M_PKTHDR)
2407 m0->m_pkthdr.len += len - remainder;
2408 return (remainder == 0);
2409}
2410
920c9f10
AH
2411/*
2412 * Apply function f to the data in an mbuf chain starting "off" bytes from
2413 * the beginning, continuing for "len" bytes.
2414 */
2415int
2416m_apply(struct mbuf *m, int off, int len,
2417 int (*f)(void *, void *, u_int), void *arg)
2418{
2419 u_int count;
2420 int rval;
2421
2422 KASSERT(off >= 0, ("m_apply, negative off %d", off));
2423 KASSERT(len >= 0, ("m_apply, negative len %d", len));
2424 while (off > 0) {
2425 KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
2426 if (off < m->m_len)
2427 break;
2428 off -= m->m_len;
2429 m = m->m_next;
2430 }
2431 while (len > 0) {
 2432 KASSERT(m != NULL, ("m_apply, length > size of mbuf chain"));
2433 count = min(m->m_len - off, len);
2434 rval = (*f)(arg, mtod(m, caddr_t) + off, count);
2435 if (rval)
2436 return (rval);
2437 len -= count;
2438 off = 0;
2439 m = m->m_next;
2440 }
2441 return (0);
2442}
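Sketch (illustrative, names hypothetical): m_apply() hands each contiguous
run of bytes to a callback, which is handy for checksumming or hashing a
region without linearizing it. The callback below merely counts bytes.

static int
example_count_cb(void *arg, void *data, u_int len)
{
	*(u_int *)arg += len;
	return (0);		/* non-zero return would abort the walk */
}

static u_int
example_count_bytes(struct mbuf *m, int off, int len)
{
	u_int total = 0;

	m_apply(m, off, len, example_count_cb, &total);
	return (total);
}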
2443
2444/*
2445 * Return a pointer to mbuf/offset of location in mbuf chain.
2446 */
2447struct mbuf *
2448m_getptr(struct mbuf *m, int loc, int *off)
2449{
2450
2451 while (loc >= 0) {
2452 /* Normal end of search. */
2453 if (m->m_len > loc) {
2454 *off = loc;
2455 return (m);
2456 } else {
2457 loc -= m->m_len;
2458 if (m->m_next == NULL) {
2459 if (loc == 0) {
2460 /* Point at the end of valid data. */
2461 *off = m->m_len;
2462 return (m);
2463 }
2464 return (NULL);
2465 }
2466 m = m->m_next;
2467 }
2468 }
2469 return (NULL);
2470}
2471
984263bc
MD
2472void
2473m_print(const struct mbuf *m)
2474{
2475 int len;
2476 const struct mbuf *m2;
f69e505f 2477 char *hexstr;
984263bc
MD
2478
2479 len = m->m_pkthdr.len;
2480 m2 = m;
f69e505f 2481 hexstr = kmalloc(HEX_NCPYLEN(len), M_TEMP, M_ZERO | M_WAITOK);
984263bc 2482 while (len) {
f69e505f
AHJ
2483 kprintf("%p %s\n", m2, hexncpy(m2->m_data, m2->m_len, hexstr,
2484 HEX_NCPYLEN(m2->m_len), "-"));
984263bc
MD
2485 len -= m2->m_len;
2486 m2 = m2->m_next;
2487 }
f69e505f 2488 kfree(hexstr, M_TEMP);
984263bc
MD
2489 return;
2490}
2491
2492/*
2493 * "Move" mbuf pkthdr from "from" to "to".
2494 * "from" must have M_PKTHDR set, and "to" must be empty.
2495 */
2496void
2497m_move_pkthdr(struct mbuf *to, struct mbuf *from)
2498{
e0d05288 2499 KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
984263bc 2500
77e294a1 2501 to->m_flags |= from->m_flags & M_COPYFLAGS;
984263bc
MD
2502 to->m_pkthdr = from->m_pkthdr; /* especially tags */
2503 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */
984263bc
MD
2504}
2505
2506/*
2507 * Duplicate "from"'s mbuf pkthdr in "to".
2508 * "from" must have M_PKTHDR set, and "to" must be empty.
2509 * In particular, this does a deep copy of the packet tags.
2510 */
2511int
f15db79e 2512m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
984263bc 2513{
7f3602fe
JH
2514 KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
2515
4bac35fc 2516 to->m_flags = (from->m_flags & M_COPYFLAGS) |
c4da22e4 2517 (to->m_flags & ~M_COPYFLAGS);
984263bc
MD
2518 to->m_pkthdr = from->m_pkthdr;
2519 SLIST_INIT(&to->m_pkthdr.tags);
2520 return (m_tag_copy_chain(to, from, how));
2521}
2522
2523/*
2524 * Defragment a mbuf chain, returning the shortest possible
2525 * chain of mbufs and clusters. If allocation fails and
2526 * this cannot be completed, NULL will be returned, but
2527 * the passed in chain will be unchanged. Upon success,
2528 * the original chain will be freed, and the new chain
2529 * will be returned.
2530 *
 2531 * If a non-packet-header mbuf is passed in, the original
 2532 * chain will be returned unharmed.
c8f5127a
JS
2533 *
2534 * m_defrag_nofree doesn't free the passed in mbuf.
984263bc
MD
2535 */
2536struct mbuf *
2537m_defrag(struct mbuf *m0, int how)
c8f5127a
JS
2538{
2539 struct mbuf *m_new;
2540
2541 if ((m_new = m_defrag_nofree(m0, how)) == NULL)
2542 return (NULL);
2543 if (m_new != m0)
2544 m_freem(m0);
2545 return (m_new);
2546}
2547
2548struct mbuf *
2549m_defrag_nofree(struct mbuf *m0, int how)
984263bc
MD
2550{
2551 struct mbuf *m_new = NULL, *m_final = NULL;
61721e90 2552 int progress = 0, length, nsize;
984263bc
MD
2553
2554 if (!(m0->m_flags & M_PKTHDR))
2555 return (m0);
2556
2557#ifdef MBUF_STRESS_TEST
2558 if (m_defragrandomfailures) {
0ced1954 2559 int temp = karc4random() & 0xff;
984263bc
MD
2560 if (temp == 0xba)
2561 goto nospace;
2562 }
2563#endif
2564
61721e90 2565 m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
984263bc
MD
2566 if (m_final == NULL)
2567 goto nospace;
61721e90 2568 m_final->m_len = 0; /* in case m0->m_pkthdr.len is zero */
984263bc 2569
3641b7ca 2570 if (m_dup_pkthdr(m_final, m0, how) == 0)
984263bc
MD
2571 goto nospace;
2572
2573 m_new = m_final;
2574
2575 while (progress < m0->m_pkthdr.len) {
2576 length = m0->m_pkthdr.len - progress;
2577 if (length > MCLBYTES)
2578 length = MCLBYTES;
2579
2580 if (m_new == NULL) {
61721e90 2581 m_new = m_getl(length, how, MT_DATA, 0, &nsize);
984263bc
MD
2582 if (m_new == NULL)
2583 goto nospace;
2584 }
2585
2586 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
2587 progress += length;
2588 m_new->m_len = length;
2589 if (m_new != m_final)
2590 m_cat(m_final, m_new);
2591 m_new = NULL;
2592 }
2593 if (m0->m_next == NULL)
2594 m_defraguseless++;
984263bc 2595 m_defragpackets++;
c8f5127a
JS
2596 m_defragbytes += m_final->m_pkthdr.len;
2597 return (m_final);
984263bc
MD
2598nospace:
2599 m_defragfailure++;
2600 if (m_new)
2601 m_free(m_new);
61721e90 2602 m_freem(m_final);
984263bc
MD
2603 return (NULL);
2604}
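Illustrative sketch: a transmit path that compacts a long chain with
m_defrag() when a DMA map has more segments than the hardware allows. The
segment counting and limit are hypothetical; only the ownership rules come
from the code above.

static struct mbuf *
example_tx_compact(struct mbuf *m, int nsegs, int maxsegs)
{
	struct mbuf *m_new;

	if (nsegs <= maxsegs)
		return (m);
	m_new = m_defrag(m, M_NOWAIT);
	if (m_new == NULL)
		return (NULL);	/* original chain is unchanged and still ours */
	return (m_new);		/* use only the returned chain from here on */
}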
0c33f36d
JH
2605
2606/*
2607 * Move data from uio into mbufs.
0c33f36d
JH
2608 */
2609struct mbuf *
e12241e1 2610m_uiomove(struct uio *uio)
0c33f36d 2611{
0c33f36d 2612 struct mbuf *m; /* current working mbuf */
e12241e1
JH
2613 struct mbuf *head = NULL; /* result mbuf chain */
2614 struct mbuf **mp = &head;
e54488bb
MD
2615 int flags = M_PKTHDR;
2616 int nsize;
2617 int error;
2618 int resid;
0c33f36d 2619
0c33f36d 2620 do {
e54488bb
MD
2621 if (uio->uio_resid > INT_MAX)
2622 resid = INT_MAX;
2623 else
2624 resid = (int)uio->uio_resid;
b5523eac 2625 m = m_getl(resid, M_WAITOK, MT_DATA, flags, &nsize);
61721e90
JH
2626 if (flags) {
2627 m->m_pkthdr.len = 0;
2628 /* Leave room for protocol headers. */
2629 if (resid < MHLEN)
2630 MH_ALIGN(m, resid);
2631 flags = 0;
0c33f36d 2632 }
e54488bb 2633 m->m_len = imin(nsize, resid);
61721e90 2634 error = uiomove(mtod(m, caddr_t), m->m_len, uio);
0c33f36d
JH
2635 if (error) {
2636 m_free(m);
2637 goto failed;
2638 }
0c33f36d
JH
2639 *mp = m;
2640 mp = &m->m_next;
61721e90 2641 head->m_pkthdr.len += m->m_len;
e54488bb 2642 } while (uio->uio_resid > 0);
0c33f36d
JH
2643
2644 return (head);
2645
2646failed:
61721e90 2647 m_freem(head);
0c33f36d
JH
2648 return (NULL);
2649}
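Sketch (illustrative): filling an mbuf chain from a user I/O request with
m_uiomove(), as a socket or device write path might. Allocation always waits
(M_WAITOK), so a NULL return means the copyin itself failed and any partial
chain was already freed. struct uio assumes <sys/uio.h>.

static struct mbuf *
example_from_uio(struct uio *uio)
{
	if (uio->uio_resid == 0)
		return (NULL);		/* nothing to move */
	return (m_uiomove(uio));
}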
df80f2ea 2650
50503f0f
JH
2651struct mbuf *
2652m_last(struct mbuf *m)
2653{
2654 while (m->m_next)
2655 m = m->m_next;
2656 return (m);
2657}
2658
df80f2ea
JH
2659/*
2660 * Return the number of bytes in an mbuf chain.
2661 * If lastm is not NULL, also return the last mbuf.
2662 */
2663u_int
2664m_lengthm(struct mbuf *m, struct mbuf **lastm)
2665{
2666 u_int len = 0;
2667 struct mbuf *prev = m;
2668
2669 while (m) {
2670 len += m->m_len;
2671 prev = m;
2672 m = m->m_next;
2673 }
2674 if (lastm != NULL)
2675 *lastm = prev;
2676 return (len);
2677}
2678
2679/*
2680 * Like m_lengthm(), except also keep track of mbuf usage.
2681 */
2682u_int
2683m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
2684{
2685 u_int len = 0, mbcnt = 0;
2686 struct mbuf *prev = m;
2687
2688 while (m) {
2689 len += m->m_len;
2690 mbcnt += MSIZE;
2691 if (m->m_flags & M_EXT)
2692 mbcnt += m->m_ext.ext_size;
2693 prev = m;
2694 m = m->m_next;
2695 }
2696 if (lastm != NULL)
2697 *lastm = prev;
2698 *pmbcnt = mbcnt;
2699 return (len);
2700}