Rename printf -> kprintf in sys/ and add some defines where necessary
[dragonfly.git] / sys / kern / uipc_mbuf.c
1/*
2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved.
3 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 *
5 * This code is derived from software contributed to The DragonFly Project
6 * by Jeffrey M. Hsu.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of The DragonFly Project nor the names of its
17 * contributors may be used to endorse or promote products derived
18 * from this software without specific, prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34/*
35 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved.
36 *
37 * License terms: all terms for the DragonFly license above plus the following:
38 *
39 * 4. All advertising materials mentioning features or use of this software
40 * must display the following acknowledgement:
41 *
42 * This product includes software developed by Jeffrey M. Hsu
43 * for the DragonFly Project.
44 *
45 * This requirement may be waived with permission from Jeffrey Hsu.
46 * This requirement will sunset and may be removed on July 8 2005,
47 * after which the standard DragonFly license (as shown above) will
48 * apply.
49 */
50
51/*
52 * Copyright (c) 1982, 1986, 1988, 1991, 1993
53 * The Regents of the University of California. All rights reserved.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 * 3. All advertising materials mentioning features or use of this software
64 * must display the following acknowledgement:
65 * This product includes software developed by the University of
66 * California, Berkeley and its contributors.
67 * 4. Neither the name of the University nor the names of its contributors
68 * may be used to endorse or promote products derived from this software
69 * without specific prior written permission.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 *
83 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
84 * $FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.51.2.24 2003/04/15 06:59:29 silby Exp $
85 * $DragonFly: src/sys/kern/uipc_mbuf.c,v 1.58 2006/12/17 19:28:30 dillon Exp $
86 */
87
88#include "opt_param.h"
89#include "opt_ddb.h"
90#include "opt_mbuf_stress_test.h"
91#include <sys/param.h>
92#include <sys/systm.h>
93#include <sys/malloc.h>
94#include <sys/mbuf.h>
95#include <sys/kernel.h>
96#include <sys/sysctl.h>
97#include <sys/domain.h>
98#include <sys/objcache.h>
99#include <sys/protosw.h>
100#include <sys/uio.h>
101#include <sys/thread.h>
102#include <sys/globaldata.h>
103#include <sys/serialize.h>
104#include <sys/thread2.h>
105
106#include <vm/vm.h>
107#include <vm/vm_kern.h>
108#include <vm/vm_extern.h>
109
110#ifdef INVARIANTS
111#include <machine/cpu.h>
112#endif
113
114/*
115 * mbuf cluster meta-data
116 */
117struct mbcluster {
118 int32_t mcl_refs;
119 void *mcl_data;
120 struct lwkt_serialize mcl_serializer;
121};
122
123static void mbinit(void *);
124SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
125
126static u_long mbtypes[MT_NTYPES];
127
128struct mbstat mbstat;
129int max_linkhdr;
130int max_protohdr;
131int max_hdr;
132int max_datalen;
133int m_defragpackets;
134int m_defragbytes;
135int m_defraguseless;
136int m_defragfailure;
137#ifdef MBUF_STRESS_TEST
138int m_defragrandomfailures;
139#endif
140
141struct objcache *mbuf_cache, *mbufphdr_cache;
142struct objcache *mclmeta_cache;
143struct objcache *mbufcluster_cache, *mbufphdrcluster_cache;
144
145int nmbclusters;
146int nmbufs;
147
148SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
149 &max_linkhdr, 0, "");
150SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
151 &max_protohdr, 0, "");
152SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
153SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
154 &max_datalen, 0, "");
155SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW,
156 &mbuf_wait, 0, "");
157SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
158SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mbtypes, CTLFLAG_RD, mbtypes,
159 sizeof(mbtypes), "LU", "");
160SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RW,
161 &nmbclusters, 0, "Maximum number of mbuf clusters available");
162SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RW, &nmbufs, 0,
163 "Maximum number of mbufs available");
164
165SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
166 &m_defragpackets, 0, "");
167SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
168 &m_defragbytes, 0, "");
169SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
170 &m_defraguseless, 0, "");
171SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
172 &m_defragfailure, 0, "");
173#ifdef MBUF_STRESS_TEST
174SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
175 &m_defragrandomfailures, 0, "");
176#endif
177
178static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
179static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl");
180static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta");
181
182static void m_reclaim (void);
183static void m_mclref(void *arg);
184static void m_mclfree(void *arg);
185
186#ifndef NMBCLUSTERS
187#define NMBCLUSTERS (512 + maxusers * 16)
188#endif
189#ifndef NMBUFS
190#define NMBUFS (nmbclusters * 2)
191#endif
192
193/*
194 * Perform sanity checks of tunables declared above.
195 */
196static void
197tunable_mbinit(void *dummy)
198{
199
200 /*
201 * This has to be done before VM init.
202 */
203 nmbclusters = NMBCLUSTERS;
204 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
205 nmbufs = NMBUFS;
206 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
207 /* Sanity checks */
208 if (nmbufs < nmbclusters * 2)
209 nmbufs = nmbclusters * 2;
210
211 return;
212}
213SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
214
215/* "number of clusters of pages" */
216#define NCL_INIT 1
217
218#define NMB_INIT 16
219
220/*
221 * The mbuf object cache only guarantees that m_next and m_nextpkt are
222 * NULL and that m_data points to the beginning of the data area. In
223 * particular, m_len and m_pkthdr.len are uninitialized. It is the
224 * responsibility of the caller to initialize those fields before use.
225 */
226
227static boolean_t __inline
228mbuf_ctor(void *obj, void *private, int ocflags)
229{
230 struct mbuf *m = obj;
231
232 m->m_next = NULL;
233 m->m_nextpkt = NULL;
234 m->m_data = m->m_dat;
235 m->m_flags = 0;
236
237 return (TRUE);
238}
239
240/*
241 * Initialize the mbuf and the packet header fields.
242 */
243static boolean_t
244mbufphdr_ctor(void *obj, void *private, int ocflags)
245{
246 struct mbuf *m = obj;
247
248 m->m_next = NULL;
249 m->m_nextpkt = NULL;
250 m->m_data = m->m_pktdat;
251 m->m_flags = M_PKTHDR | M_PHCACHE;
252
253 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */
254 SLIST_INIT(&m->m_pkthdr.tags);
255 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */
256 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */
257
258 return (TRUE);
259}
260
261/*
262 * An mbcluster object consists of a 2K (MCLBYTES) cluster and a refcount.
263 */
264static boolean_t
265mclmeta_ctor(void *obj, void *private, int ocflags)
266{
267 struct mbcluster *cl = obj;
268 void *buf;
269
270 if (ocflags & M_NOWAIT)
271 buf = kmalloc(MCLBYTES, M_MBUFCL, M_NOWAIT | M_ZERO);
272 else
273 buf = kmalloc(MCLBYTES, M_MBUFCL, M_INTWAIT | M_ZERO);
274 if (buf == NULL)
275 return (FALSE);
276 cl->mcl_refs = 0;
277 cl->mcl_data = buf;
278 lwkt_serialize_init(&cl->mcl_serializer);
279 return (TRUE);
280}
281
282static void
283mclmeta_dtor(void *obj, void *private)
284{
285 struct mbcluster *mcl = obj;
286
287 KKASSERT(mcl->mcl_refs == 0);
288 kfree(mcl->mcl_data, M_MBUFCL);
289}
290
291static void
292linkcluster(struct mbuf *m, struct mbcluster *cl)
293{
294 /*
295 * Add the cluster to the mbuf. The caller will detect that the
296 * mbuf now has an attached cluster.
297 */
298 m->m_ext.ext_arg = cl;
299 m->m_ext.ext_buf = cl->mcl_data;
300 m->m_ext.ext_ref = m_mclref;
301 m->m_ext.ext_free = m_mclfree;
302 m->m_ext.ext_size = MCLBYTES;
303 atomic_add_int(&cl->mcl_refs, 1);
304
305 m->m_data = m->m_ext.ext_buf;
306 m->m_flags |= M_EXT | M_EXT_CLUSTER;
307}
308
309static boolean_t
310mbufphdrcluster_ctor(void *obj, void *private, int ocflags)
311{
312 struct mbuf *m = obj;
313 struct mbcluster *cl;
314
315 mbufphdr_ctor(obj, private, ocflags);
316 cl = objcache_get(mclmeta_cache, ocflags);
317 if (cl == NULL)
318 return (FALSE);
319 m->m_flags |= M_CLCACHE;
320 linkcluster(m, cl);
321 return (TRUE);
322}
323
324static boolean_t
325mbufcluster_ctor(void *obj, void *private, int ocflags)
326{
327 struct mbuf *m = obj;
328 struct mbcluster *cl;
329
330 mbuf_ctor(obj, private, ocflags);
331 cl = objcache_get(mclmeta_cache, ocflags);
332 if (cl == NULL)
333 return (FALSE);
334 m->m_flags |= M_CLCACHE;
335 linkcluster(m, cl);
336 return (TRUE);
337}
338
339/*
340 * Used for both the cluster and cluster PHDR caches.
341 *
342 * The mbuf may have lost its cluster due to sharing; deal
343 * with the situation by checking M_EXT.
344 */
345static void
346mbufcluster_dtor(void *obj, void *private)
347{
348 struct mbuf *m = obj;
349 struct mbcluster *mcl;
350
351 if (m->m_flags & M_EXT) {
352 KKASSERT((m->m_flags & M_EXT_CLUSTER) != 0);
353 mcl = m->m_ext.ext_arg;
354 KKASSERT(mcl->mcl_refs == 1);
355 mcl->mcl_refs = 0;
356 objcache_put(mclmeta_cache, mcl);
357 }
358}
359
360struct objcache_malloc_args mbuf_malloc_args = { MSIZE, M_MBUF };
361struct objcache_malloc_args mclmeta_malloc_args =
362 { sizeof(struct mbcluster), M_MCLMETA };
363
364/* ARGSUSED*/
365static void
366mbinit(void *dummy)
367{
368 mbstat.m_msize = MSIZE;
369 mbstat.m_mclbytes = MCLBYTES;
370 mbstat.m_minclsize = MINCLSIZE;
371 mbstat.m_mlen = MLEN;
372 mbstat.m_mhlen = MHLEN;
373
374 mbuf_cache = objcache_create("mbuf", nmbufs, 0,
375 mbuf_ctor, NULL, NULL,
376 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
377 mbufphdr_cache = objcache_create("mbuf pkt hdr", nmbufs, 64,
378 mbufphdr_ctor, NULL, NULL,
379 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
380 mclmeta_cache = objcache_create("cluster mbuf", nmbclusters, 0,
381 mclmeta_ctor, mclmeta_dtor, NULL,
382 objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args);
383 mbufcluster_cache = objcache_create("mbuf + cluster", nmbclusters, 0,
384 mbufcluster_ctor, mbufcluster_dtor, NULL,
385 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
386 mbufphdrcluster_cache = objcache_create("mbuf pkt hdr + cluster",
387 nmbclusters, 64, mbufphdrcluster_ctor, mbufcluster_dtor, NULL,
388 objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args);
389 return;
390}
391
392/*
393 * Return the number of references to this mbuf's data. 0 is returned
394 * if the mbuf is not M_EXT, a reference count is returned if it is
395 * M_EXT | M_EXT_CLUSTER, and 99 is returned if it is a special M_EXT.
396 */
397int
398m_sharecount(struct mbuf *m)
399{
400 switch (m->m_flags & (M_EXT | M_EXT_CLUSTER)) {
401 case 0:
402 return (0);
403 case M_EXT:
404 return (99);
405 case M_EXT | M_EXT_CLUSTER:
406 return (((struct mbcluster *)m->m_ext.ext_arg)->mcl_refs);
407 }
408 /* NOTREACHED */
409 return (0); /* to shut up compiler */
410}
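
/*
 * Illustrative sketch of a typical caller of m_sharecount(): data living
 * in a shared cluster (share count > 1) or in special M_EXT storage must
 * not be modified in place.  The example_* name is hypothetical; 'm' is
 * assumed to be a packet-header mbuf, as m_dup() requires.
 */
#if 0
static struct mbuf *
example_make_writable(struct mbuf *m, int how)
{
	struct mbuf *n;

	if (m_sharecount(m) <= 1)
		return (m);		/* unshared, safe to modify in place */

	n = m_dup(m, how);		/* deep copy, including cluster data */
	if (n != NULL)
		m_freem(m);
	return (n);			/* NULL: copy failed, caller keeps m */
}
#endif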
411
412/*
413 * change mbuf to new type
414 */
415void
416m_chtype(struct mbuf *m, int type)
417{
418 crit_enter();
419 ++mbtypes[type];
420 --mbtypes[m->m_type];
421 m->m_type = type;
422 crit_exit();
423}
424
425static void
426m_reclaim(void)
427{
428 struct domain *dp;
429 struct protosw *pr;
430
431 crit_enter();
432 SLIST_FOREACH(dp, &domains, dom_next) {
433 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
434 if (pr->pr_drain)
435 (*pr->pr_drain)();
436 }
437 }
438 crit_exit();
439 mbstat.m_drain++;
440}
441
442static void __inline
443updatestats(struct mbuf *m, int type)
444{
445 m->m_type = type;
446
447 crit_enter();
448 ++mbtypes[type];
449 ++mbstat.m_mbufs;
450 crit_exit();
451}
452
453/*
454 * Allocate an mbuf.
455 */
456struct mbuf *
457m_get(int how, int type)
458{
459 struct mbuf *m;
460 int ntries = 0;
461 int ocf = MBTOM(how);
462
463retryonce:
464
465 m = objcache_get(mbuf_cache, ocf);
466
467 if (m == NULL) {
468 if ((how & MB_TRYWAIT) && ntries++ == 0) {
469 struct objcache *reclaimlist[] = {
470 mbufphdr_cache,
471 mbufcluster_cache, mbufphdrcluster_cache
472 };
473 const int nreclaims = __arysize(reclaimlist);
474
475 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
476 m_reclaim();
477 goto retryonce;
478 }
479 return (NULL);
480 }
481
482 updatestats(m, type);
483 return (m);
484}
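
/*
 * Illustrative sketch: as the object cache comments above note, m_get()
 * does not initialize m_len, so the caller must set it before using the
 * mbuf.  The example_* name is hypothetical.
 */
#if 0
static struct mbuf *
example_build_small_record(const void *src, int len, int how)
{
	struct mbuf *m;

	if (len > MLEN)			/* keep the sketch to a single mbuf */
		return (NULL);
	m = m_get(how, MT_DATA);
	if (m == NULL)
		return (NULL);
	m->m_len = len;			/* caller-initialized, see above */
	bcopy(src, mtod(m, caddr_t), len);
	return (m);
}
#endif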
485
486struct mbuf *
487m_gethdr(int how, int type)
488{
489 struct mbuf *m;
490 int ocf = MBTOM(how);
491 int ntries = 0;
492
493retryonce:
494
495 m = objcache_get(mbufphdr_cache, ocf);
496
497 if (m == NULL) {
498 if ((how & MB_TRYWAIT) && ntries++ == 0) {
499 struct objcache *reclaimlist[] = {
500 mbuf_cache,
501 mbufcluster_cache, mbufphdrcluster_cache
502 };
503 const int nreclaims = __arysize(reclaimlist);
504
505 if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf))
506 m_reclaim();
507 goto retryonce;
508 }
509 return (NULL);
510 }
511
512 updatestats(m, type);
513 return (m);
514}
515
516/*
517 * Get an mbuf (not an mbuf cluster!) and zero it.
518 * Deprecated.
519 */
520struct mbuf *
521m_getclr(int how, int type)
522{
523 struct mbuf *m;
524
525 m = m_get(how, type);
526 if (m != NULL)
527 bzero(m->m_data, MLEN);
528 return (m);
529}
530
531/*
532 * Returns an mbuf with an attached cluster.
533 * Because many network drivers use this kind of buffer a lot, it is
534 * convenient to keep a small pool of free buffers of this kind.
535 * Even a small size such as 10 gives about 10% improvement in the
536 * forwarding rate in a bridge or router.
537 */
538struct mbuf *
539m_getcl(int how, short type, int flags)
540{
541 struct mbuf *m;
542 int ocflags = MBTOM(how);
543 int ntries = 0;
544
545retryonce:
546
547 if (flags & M_PKTHDR)
548 m = objcache_get(mbufphdrcluster_cache, ocflags);
549 else
550 m = objcache_get(mbufcluster_cache, ocflags);
551
552 if (m == NULL) {
553 if ((how & MB_TRYWAIT) && ntries++ == 0) {
554 struct objcache *reclaimlist[1];
555
556 if (flags & M_PKTHDR)
557 reclaimlist[0] = mbufcluster_cache;
558 else
559 reclaimlist[0] = mbufphdrcluster_cache;
560 if (!objcache_reclaimlist(reclaimlist, 1, ocflags))
561 m_reclaim();
562 goto retryonce;
563 }
564 return (NULL);
565 }
566
567 m->m_type = type;
568
569 crit_enter();
570 ++mbtypes[type];
571 ++mbstat.m_clusters;
572 crit_exit();
573 return (m);
574}
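
/*
 * Illustrative sketch of the driver receive path mentioned above: one
 * m_getcl() call yields a packet-header mbuf with a cluster attached,
 * so only the lengths and the receiving interface remain to be set.
 * The example_* name and the DMA step are hypothetical.
 */
#if 0
static struct mbuf *
example_rx_alloc(struct ifnet *ifp, int pktlen)
{
	struct mbuf *m;

	if (pktlen > MCLBYTES)
		return (NULL);
	m = m_getcl(MB_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);
	m->m_len = m->m_pkthdr.len = pktlen;
	m->m_pkthdr.rcvif = ifp;
	/* ... copy or DMA pktlen bytes into mtod(m, void *) here ... */
	return (m);
}
#endif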
575
576/*
577 * Allocate chain of requested length.
578 */
579struct mbuf *
580m_getc(int len, int how, int type)
581{
582 struct mbuf *n, *nfirst = NULL, **ntail = &nfirst;
583 int nsize;
584
585 while (len > 0) {
586 n = m_getl(len, how, type, 0, &nsize);
587 if (n == NULL)
588 goto failed;
589 n->m_len = 0;
590 *ntail = n;
591 ntail = &n->m_next;
592 len -= nsize;
593 }
594 return (nfirst);
595
596failed:
597 m_freem(nfirst);
598 return (NULL);
599}
600
601/*
602 * Allocate len-worth of mbufs and/or mbuf clusters (whatever fits best)
603 * and return a pointer to the head of the allocated chain. If m0 is
604 * non-null, then we assume that it is a single mbuf or an mbuf chain to
605 * which we want len bytes worth of mbufs and/or clusters attached, and so
606 * if we succeed in allocating it, we will just return a pointer to m0.
607 *
608 * If we happen to fail at any point during the allocation, we will free
609 * up everything we have already allocated and return NULL.
610 *
611 * Deprecated. Use m_getc() and m_cat() instead.
612 */
613struct mbuf *
614m_getm(struct mbuf *m0, int len, int type, int how)
615{
616 struct mbuf *nfirst;
617
618 nfirst = m_getc(len, how, type);
619
620 if (m0 != NULL) {
621 m_last(m0)->m_next = nfirst;
622 return (m0);
623 }
624
625 return (nfirst);
626}
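
/*
 * Illustrative sketch of the replacement suggested above: build the new
 * mbufs with m_getc(), fill them, and append them with m_cat() rather
 * than calling the deprecated m_getm().  The example_* name is
 * hypothetical.
 */
#if 0
static struct mbuf *
example_append_data(struct mbuf *m0, const char *buf, int len, int how)
{
	struct mbuf *n, *m;

	n = m_getc(len, how, MT_DATA);		/* chain with m_len == 0 */
	if (n == NULL)
		return (NULL);			/* m0 is left untouched */
	for (m = n; len > 0; m = m->m_next) {
		m->m_len = min(len, M_TRAILINGSPACE(m));
		bcopy(buf, mtod(m, caddr_t), m->m_len);
		buf += m->m_len;
		len -= m->m_len;
	}
	m_cat(m0, n);		/* note: m_pkthdr.len is not updated */
	return (m0);
}
#endif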
627
628/*
629 * Adds a cluster to a normal mbuf, M_EXT is set on success.
630 * Deprecated. Use m_getcl() instead.
631 */
632void
633m_mclget(struct mbuf *m, int how)
634{
635 struct mbcluster *mcl;
636
637 KKASSERT((m->m_flags & M_EXT) == 0);
638 mcl = objcache_get(mclmeta_cache, MBTOM(how));
639 if (mcl != NULL) {
640 linkcluster(m, mcl);
641 crit_enter();
642 ++mbstat.m_clusters;
643 /* leave the m_mbufs count intact for original mbuf */
644 crit_exit();
645 }
646}
647
648/*
649 * Updates to mbcluster must be MPSAFE. Only an entity which already has
650 * a reference to the cluster can ref it, so we are in no danger of
651 * racing an add with a subtract. But the operation must still be atomic
652 * since multiple entities may have a reference on the cluster.
653 *
654 * m_mclfree() is almost the same but it must contend with two entities
655 * freeing the cluster at the same time. If there is only one reference
656 * count we are the only entity referencing the cluster and no further
657 * locking is required. Otherwise we must protect against a race to 0
658 * with the serializer.
659 */
660static void
661m_mclref(void *arg)
662{
663 struct mbcluster *mcl = arg;
664
665 atomic_add_int(&mcl->mcl_refs, 1);
666}
667
668static void
669m_mclfree(void *arg)
670{
671 struct mbcluster *mcl = arg;
672
673 if (mcl->mcl_refs == 1) {
674 mcl->mcl_refs = 0;
675 objcache_put(mclmeta_cache, mcl);
676 } else {
677 lwkt_serialize_enter(&mcl->mcl_serializer);
678 if (mcl->mcl_refs > 1) {
679 atomic_subtract_int(&mcl->mcl_refs, 1);
680 lwkt_serialize_exit(&mcl->mcl_serializer);
681 } else {
682 lwkt_serialize_exit(&mcl->mcl_serializer);
683 KKASSERT(mcl->mcl_refs == 1);
684 mcl->mcl_refs = 0;
685 objcache_put(mclmeta_cache, mcl);
686 }
687 }
688}
689
690extern void db_print_backtrace(void);
691
692/*
693 * Free a single mbuf and any associated external storage. The successor,
694 * if any, is returned.
695 *
696 * We do need to check non-first mbufs for m_aux, since some existing
697 * code does not call M_PREPEND properly.
698 * (example: call to bpf_mtap from drivers)
699 */
700struct mbuf *
701m_free(struct mbuf *m)
702{
703 struct mbuf *n;
704
705 KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m));
706 --mbtypes[m->m_type];
707
708 n = m->m_next;
709
710 /*
711 * Make sure the mbuf is in constructed state before returning it
712 * to the objcache.
713 */
714 m->m_next = NULL;
715#ifdef notyet
716 KKASSERT(m->m_nextpkt == NULL);
717#else
718 if (m->m_nextpkt != NULL) {
719#ifdef DDB
720 static int afewtimes = 10;
721
722 if (afewtimes-- > 0) {
723 kprintf("mfree: m->m_nextpkt != NULL\n");
724 db_print_backtrace();
725 }
726#endif
727 m->m_nextpkt = NULL;
728 }
729#endif
730 if (m->m_flags & M_PKTHDR) {
731 m_tag_delete_chain(m); /* eliminate XXX JH */
732 }
733
734 m->m_flags &= (M_EXT | M_EXT_CLUSTER | M_CLCACHE | M_PHCACHE);
735
736 /*
737 * Clean the M_PKTHDR state so we can return the mbuf to its original
738 * cache. This is based on the PHCACHE flag which tells us whether
739 * the mbuf was originally allocated out of a packet-header cache
740 * or a non-packet-header cache.
741 */
742 if (m->m_flags & M_PHCACHE) {
743 m->m_flags |= M_PKTHDR;
744 m->m_pkthdr.rcvif = NULL; /* eliminate XXX JH */
745 m->m_pkthdr.csum_flags = 0; /* eliminate XXX JH */
746 m->m_pkthdr.fw_flags = 0; /* eliminate XXX JH */
747 SLIST_INIT(&m->m_pkthdr.tags);
748 }
749
750 /*
751 * Handle remaining flags combinations. M_CLCACHE tells us whether
752 * the mbuf was originally allocated from a cluster cache or not,
753 * and is totally separate from whether the mbuf is currently
754 * associated with a cluster.
755 */
756 crit_enter();
757 switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) {
758 case M_CLCACHE | M_EXT | M_EXT_CLUSTER:
759 /*
760 * mbuf+cluster cache case. The mbuf was allocated from the
761 * combined mbuf_cluster cache and can be returned to the
762 * cache if the cluster hasn't been shared.
763 */
764 if (m_sharecount(m) == 1) {
765 /*
766 * The cluster has not been shared, we can just
767 * reset the data pointer and return the mbuf
768 * to the cluster cache. Note that the reference
769 * count is left intact (it is still associated with
770 * an mbuf).
771 */
772 m->m_data = m->m_ext.ext_buf;
773 if (m->m_flags & M_PHCACHE)
774 objcache_put(mbufphdrcluster_cache, m);
775 else
776 objcache_put(mbufcluster_cache, m);
777 --mbstat.m_clusters;
778 } else {
779 /*
780 * Hell. Someone else has a ref on this cluster,
781 * we have to disconnect it which means we can't
782 * put it back into the mbufcluster_cache, we
783 * have to destroy the mbuf.
784 *
785 * Other mbuf references to the cluster will typically
786 * be M_EXT | M_EXT_CLUSTER but without M_CLCACHE.
787 *
788 * XXX we could try to connect another cluster to
789 * it.
790 */
791 m->m_ext.ext_free(m->m_ext.ext_arg);
792 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
793 if (m->m_flags & M_PHCACHE)
794 objcache_dtor(mbufphdrcluster_cache, m);
795 else
796 objcache_dtor(mbufcluster_cache, m);
797 }
798 break;
799 case M_EXT | M_EXT_CLUSTER:
800 /*
801 * Normal cluster associated with an mbuf that was allocated
802 * from the normal mbuf pool rather than the cluster pool.
803 * The cluster has to be independently disassociated from the
804 * mbuf.
805 */
806 if (m_sharecount(m) == 1)
807 --mbstat.m_clusters;
808 /* fall through */
809 case M_EXT:
810 /*
811 * Normal cluster association case, disconnect the cluster from
812 * the mbuf. The cluster may or may not be custom.
813 */
814 m->m_ext.ext_free(m->m_ext.ext_arg);
815 m->m_flags &= ~(M_EXT | M_EXT_CLUSTER);
816 /* fall through */
817 case 0:
818 /*
819 * return the mbuf to the mbuf cache.
820 */
821 if (m->m_flags & M_PHCACHE) {
822 m->m_data = m->m_pktdat;
823 objcache_put(mbufphdr_cache, m);
824 } else {
825 m->m_data = m->m_dat;
826 objcache_put(mbuf_cache, m);
827 }
828 --mbstat.m_mbufs;
829 break;
830 default:
831 if (!panicstr)
832 panic("bad mbuf flags %p %08x\n", m, m->m_flags);
833 break;
834 }
835 crit_exit();
836 return (n);
837}
838
839void
840m_freem(struct mbuf *m)
841{
842 crit_enter();
843 while (m)
844 m = m_free(m);
845 crit_exit();
846}
847
848/*
849 * mbuf utility routines
850 */
851
852/*
853 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain and
854 * copy junk along.
855 */
856struct mbuf *
857m_prepend(struct mbuf *m, int len, int how)
858{
859 struct mbuf *mn;
860
861 if (m->m_flags & M_PKTHDR)
862 mn = m_gethdr(how, m->m_type);
863 else
864 mn = m_get(how, m->m_type);
865 if (mn == NULL) {
866 m_freem(m);
867 return (NULL);
868 }
869 if (m->m_flags & M_PKTHDR)
870 M_MOVE_PKTHDR(mn, m);
871 mn->m_next = m;
872 m = mn;
873 if (len < MHLEN)
874 MH_ALIGN(m, len);
875 m->m_len = len;
876 return (m);
877}
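
/*
 * Illustrative sketch: callers normally reach m_prepend() through the
 * M_PREPEND() macro, which uses leading space in the first mbuf when
 * available and falls back to this routine otherwise.  The 8-byte
 * header size and the example_* name are hypothetical.
 */
#if 0
static struct mbuf *
example_add_encap_header(struct mbuf *m)
{
	M_PREPEND(m, 8, MB_DONTWAIT);	/* sets m to NULL on failure */
	if (m == NULL)
		return (NULL);
	/* mtod(m, void *) now points at 8 bytes of fresh header space */
	return (m);
}
#endif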
878
879/*
880 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
881 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
882 * The wait parameter is a choice of MB_WAIT/MB_DONTWAIT from caller.
883 * Note that the copy is read-only, because clusters are not copied,
884 * only their reference counts are incremented.
885 */
886struct mbuf *
887m_copym(const struct mbuf *m, int off0, int len, int wait)
888{
889 struct mbuf *n, **np;
890 int off = off0;
891 struct mbuf *top;
892 int copyhdr = 0;
893
894 KASSERT(off >= 0, ("m_copym, negative off %d", off));
895 KASSERT(len >= 0, ("m_copym, negative len %d", len));
896 if (off == 0 && m->m_flags & M_PKTHDR)
897 copyhdr = 1;
898 while (off > 0) {
899 KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
900 if (off < m->m_len)
901 break;
902 off -= m->m_len;
903 m = m->m_next;
904 }
905 np = &top;
906 top = 0;
907 while (len > 0) {
908 if (m == NULL) {
909 KASSERT(len == M_COPYALL,
910 ("m_copym, length > size of mbuf chain"));
911 break;
912 }
913 /*
914 * Because we are sharing any cluster attachment below,
915 * be sure to get an mbuf that does not have a cluster
916 * associated with it.
917 */
918 if (copyhdr)
919 n = m_gethdr(wait, m->m_type);
920 else
921 n = m_get(wait, m->m_type);
922 *np = n;
923 if (n == NULL)
924 goto nospace;
925 if (copyhdr) {
926 if (!m_dup_pkthdr(n, m, wait))
927 goto nospace;
928 if (len == M_COPYALL)
929 n->m_pkthdr.len -= off0;
930 else
931 n->m_pkthdr.len = len;
932 copyhdr = 0;
933 }
934 n->m_len = min(len, m->m_len - off);
935 if (m->m_flags & M_EXT) {
936 KKASSERT((n->m_flags & M_EXT) == 0);
937 n->m_data = m->m_data + off;
938 m->m_ext.ext_ref(m->m_ext.ext_arg);
939 n->m_ext = m->m_ext;
940 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
941 } else {
942 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
943 (unsigned)n->m_len);
944 }
945 if (len != M_COPYALL)
946 len -= n->m_len;
947 off = 0;
948 m = m->m_next;
949 np = &n->m_next;
950 }
951 if (top == NULL)
952 mbstat.m_mcfail++;
953 return (top);
954nospace:
955 m_freem(top);
956 mbstat.m_mcfail++;
957 return (NULL);
958}
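
/*
 * Illustrative sketch: because m_copym() shares clusters instead of
 * copying them, the result is cheap but strictly read-only, which is
 * exactly what passive consumers (taps, retransmission queues) want.
 * The example_* name is hypothetical.
 */
#if 0
static u_int
example_tap(struct mbuf *m)
{
	struct mbuf *copy;
	u_int len;

	copy = m_copym(m, 0, M_COPYALL, MB_DONTWAIT);
	if (copy == NULL)
		return (0);
	len = m_lengthm(copy, NULL);	/* inspect without writing */
	m_freem(copy);			/* drops the shared cluster refs */
	return (len);
}
#endif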
959
960/*
961 * Copy an entire packet, including header (which must be present).
962 * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
963 * Note that the copy is read-only, because clusters are not copied,
964 * only their reference counts are incremented.
965 * Preserve alignment of the first mbuf so if the creator has left
966 * some room at the beginning (e.g. for inserting protocol headers)
967 * the copies also have the room available.
968 */
969struct mbuf *
970m_copypacket(struct mbuf *m, int how)
971{
972 struct mbuf *top, *n, *o;
973
974 n = m_gethdr(how, m->m_type);
975 top = n;
976 if (!n)
977 goto nospace;
978
979 if (!m_dup_pkthdr(n, m, how))
980 goto nospace;
981 n->m_len = m->m_len;
982 if (m->m_flags & M_EXT) {
983 KKASSERT((n->m_flags & M_EXT) == 0);
984 n->m_data = m->m_data;
985 m->m_ext.ext_ref(m->m_ext.ext_arg);
986 n->m_ext = m->m_ext;
987 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
988 } else {
989 n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
990 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
991 }
992
993 m = m->m_next;
994 while (m) {
995 o = m_get(how, m->m_type);
996 if (!o)
997 goto nospace;
998
999 n->m_next = o;
1000 n = n->m_next;
1001
1002 n->m_len = m->m_len;
1003 if (m->m_flags & M_EXT) {
1004 KKASSERT((n->m_flags & M_EXT) == 0);
1005 n->m_data = m->m_data;
1006 m->m_ext.ext_ref(m->m_ext.ext_arg);
1007 n->m_ext = m->m_ext;
1008 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1009 } else {
1010 bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
1011 }
1012
1013 m = m->m_next;
1014 }
1015 return top;
1016nospace:
1017 m_freem(top);
1018 mbstat.m_mcfail++;
1019 return (NULL);
1020}
1021
1022/*
1023 * Copy data from an mbuf chain starting "off" bytes from the beginning,
1024 * continuing for "len" bytes, into the indicated buffer.
1025 */
1026void
1027m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
1028{
1029 unsigned count;
1030
1031 KASSERT(off >= 0, ("m_copydata, negative off %d", off));
1032 KASSERT(len >= 0, ("m_copydata, negative len %d", len));
1033 while (off > 0) {
1034 KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
1035 if (off < m->m_len)
1036 break;
1037 off -= m->m_len;
1038 m = m->m_next;
1039 }
1040 while (len > 0) {
1041 KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
1042 count = min(m->m_len - off, len);
1043 bcopy(mtod(m, caddr_t) + off, cp, count);
1044 len -= count;
1045 cp += count;
1046 off = 0;
1047 m = m->m_next;
1048 }
1049}
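
/*
 * Illustrative sketch: m_copydata() is the safe way to read a header
 * that may be split across several mbufs, copying it into private
 * storage instead of assuming contiguity.  'struct example_hdr' and the
 * example_* name are hypothetical.
 */
#if 0
struct example_hdr {
	uint16_t	eh_type;
	uint16_t	eh_len;
};

static boolean_t
example_peek_header(const struct mbuf *m, struct example_hdr *hdr)
{
	if (m->m_pkthdr.len < (int)sizeof(*hdr))
		return (FALSE);
	m_copydata(m, 0, sizeof(*hdr), (caddr_t)hdr);
	return (TRUE);
}
#endif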
1050
1051/*
1052 * Copy a packet header mbuf chain into a completely new chain, including
1053 * copying any mbuf clusters. Use this instead of m_copypacket() when
1054 * you need a writable copy of an mbuf chain.
1055 */
1056struct mbuf *
1057m_dup(struct mbuf *m, int how)
1058{
1059 struct mbuf **p, *top = NULL;
1060 int remain, moff, nsize;
1061
1062 /* Sanity check */
1063 if (m == NULL)
1064 return (NULL);
1065 KASSERT((m->m_flags & M_PKTHDR) != 0, ("%s: !PKTHDR", __func__));
1066
1067 /* While there's more data, get a new mbuf, tack it on, and fill it */
1068 remain = m->m_pkthdr.len;
1069 moff = 0;
1070 p = &top;
1071 while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
1072 struct mbuf *n;
1073
1074 /* Get the next new mbuf */
1075 n = m_getl(remain, how, m->m_type, top == NULL ? M_PKTHDR : 0,
1076 &nsize);
1077 if (n == NULL)
1078 goto nospace;
1079 if (top == NULL)
1080 if (!m_dup_pkthdr(n, m, how))
1081 goto nospace0;
1082
1083 /* Link it into the new chain */
1084 *p = n;
1085 p = &n->m_next;
1086
1087 /* Copy data from original mbuf(s) into new mbuf */
1088 n->m_len = 0;
1089 while (n->m_len < nsize && m != NULL) {
1090 int chunk = min(nsize - n->m_len, m->m_len - moff);
1091
1092 bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
1093 moff += chunk;
1094 n->m_len += chunk;
1095 remain -= chunk;
1096 if (moff == m->m_len) {
1097 m = m->m_next;
1098 moff = 0;
1099 }
1100 }
1101
1102 /* Check correct total mbuf length */
1103 KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
1104 ("%s: bogus m_pkthdr.len", __func__));
1105 }
1106 return (top);
1107
1108nospace:
1109 m_freem(top);
1110nospace0:
1111 mbstat.m_mcfail++;
1112 return (NULL);
1113}
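
/*
 * Illustrative sketch of the choice between the two packet-copy
 * routines: m_copypacket() shares clusters (cheap, read-only result)
 * while m_dup() also copies the data, so the result may be modified.
 * The example_* name is hypothetical.
 */
#if 0
static struct mbuf *
example_copy_packet(struct mbuf *m, int how, boolean_t need_writable)
{
	if (need_writable)
		return (m_dup(m, how));
	return (m_copypacket(m, how));
}
#endif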
1114
1115/*
1116 * Concatenate mbuf chain n to m.
1117 * Both chains must be of the same type (e.g. MT_DATA).
1118 * Any m_pkthdr is not updated.
1119 */
1120void
1121m_cat(struct mbuf *m, struct mbuf *n)
1122{
1123 m = m_last(m);
1124 while (n) {
1125 if (m->m_flags & M_EXT ||
1126 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
1127 /* just join the two chains */
1128 m->m_next = n;
1129 return;
1130 }
1131 /* splat the data from one into the other */
1132 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1133 (u_int)n->m_len);
1134 m->m_len += n->m_len;
1135 n = m_free(n);
1136 }
1137}
1138
1139void
1140m_adj(struct mbuf *mp, int req_len)
1141{
1142 int len = req_len;
1143 struct mbuf *m;
1144 int count;
1145
1146 if ((m = mp) == NULL)
1147 return;
1148 if (len >= 0) {
1149 /*
1150 * Trim from head.
1151 */
1152 while (m != NULL && len > 0) {
1153 if (m->m_len <= len) {
1154 len -= m->m_len;
1155 m->m_len = 0;
1156 m = m->m_next;
1157 } else {
1158 m->m_len -= len;
1159 m->m_data += len;
1160 len = 0;
1161 }
1162 }
1163 m = mp;
1164 if (mp->m_flags & M_PKTHDR)
1165 m->m_pkthdr.len -= (req_len - len);
1166 } else {
1167 /*
1168 * Trim from tail. Scan the mbuf chain,
1169 * calculating its length and finding the last mbuf.
1170 * If the adjustment only affects this mbuf, then just
1171 * adjust and return. Otherwise, rescan and truncate
1172 * after the remaining size.
1173 */
1174 len = -len;
1175 count = 0;
1176 for (;;) {
1177 count += m->m_len;
1178 if (m->m_next == (struct mbuf *)0)
1179 break;
1180 m = m->m_next;
1181 }
1182 if (m->m_len >= len) {
1183 m->m_len -= len;
1184 if (mp->m_flags & M_PKTHDR)
1185 mp->m_pkthdr.len -= len;
1186 return;
1187 }
1188 count -= len;
1189 if (count < 0)
1190 count = 0;
1191 /*
1192 * Correct length for chain is "count".
1193 * Find the mbuf with last data, adjust its length,
1194 * and toss data from remaining mbufs on chain.
1195 */
1196 m = mp;
1197 if (m->m_flags & M_PKTHDR)
1198 m->m_pkthdr.len = count;
1199 for (; m; m = m->m_next) {
1200 if (m->m_len >= count) {
1201 m->m_len = count;
1202 break;
1203 }
1204 count -= m->m_len;
1205 }
1206 while (m->m_next)
1207 (m = m->m_next)->m_len = 0;
1208 }
1209}
1210
1211/*
1212 * Rearrange an mbuf chain so that len bytes are contiguous
1213 * and in the data area of an mbuf (so that mtod will work for a structure
1214 * of size len). Returns the resulting mbuf chain on success, frees it and
1215 * returns null on failure. If there is room, it will add up to
1216 * max_protohdr-len extra bytes to the contiguous region in an attempt to
1217 * avoid being called next time.
1218 */
1219struct mbuf *
1220m_pullup(struct mbuf *n, int len)
1221{
1222 struct mbuf *m;
1223 int count;
1224 int space;
1225
1226 /*
1227 * If first mbuf has no cluster, and has room for len bytes
1228 * without shifting current data, pullup into it,
1229 * otherwise allocate a new mbuf to prepend to the chain.
1230 */
1231 if (!(n->m_flags & M_EXT) &&
1232 n->m_data + len < &n->m_dat[MLEN] &&
1233 n->m_next) {
1234 if (n->m_len >= len)
1235 return (n);
1236 m = n;
1237 n = n->m_next;
1238 len -= m->m_len;
1239 } else {
1240 if (len > MHLEN)
1241 goto bad;
1242 if (n->m_flags & M_PKTHDR)
1243 m = m_gethdr(MB_DONTWAIT, n->m_type);
1244 else
1245 m = m_get(MB_DONTWAIT, n->m_type);
1246 if (m == NULL)
1247 goto bad;
1248 m->m_len = 0;
1249 if (n->m_flags & M_PKTHDR)
1250 M_MOVE_PKTHDR(m, n);
1251 }
1252 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
1253 do {
1254 count = min(min(max(len, max_protohdr), space), n->m_len);
1255 bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
1256 (unsigned)count);
1257 len -= count;
1258 m->m_len += count;
1259 n->m_len -= count;
1260 space -= count;
1261 if (n->m_len)
1262 n->m_data += count;
1263 else
1264 n = m_free(n);
1265 } while (len > 0 && n);
1266 if (len > 0) {
1267 m_free(m);
1268 goto bad;
1269 }
1270 m->m_next = n;
1271 return (m);
1272bad:
1273 m_freem(n);
1274 mbstat.m_mpfail++;
1275 return (NULL);
1276}
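
/*
 * Illustrative sketch: the canonical m_pullup() pattern used by protocol
 * input routines before dereferencing a header through mtod().  The
 * 20-byte header size (an IPv4 header without options) and the
 * example_* name are only examples.
 */
#if 0
static struct mbuf *
example_input(struct mbuf *m)
{
	if (m->m_len < 20) {
		m = m_pullup(m, 20);
		if (m == NULL)		/* chain already freed by m_pullup() */
			return (NULL);
	}
	/* the first 20 bytes are now contiguous at mtod(m, caddr_t) */
	return (m);
}
#endif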
1277
1278/*
1279 * Partition an mbuf chain in two pieces, returning the tail --
1280 * all but the first len0 bytes. In case of failure, it returns NULL and
1281 * attempts to restore the chain to its original state.
1282 *
1283 * Note that the resulting mbufs might be read-only, because the new
1284 * mbuf can end up sharing an mbuf cluster with the original mbuf if
1285 * the "breaking point" happens to lie within a cluster mbuf. Use the
1286 * M_WRITABLE() macro to check for this case.
1287 */
1288struct mbuf *
1289m_split(struct mbuf *m0, int len0, int wait)
1290{
1291 struct mbuf *m, *n;
1292 unsigned len = len0, remain;
1293
1294 for (m = m0; m && len > m->m_len; m = m->m_next)
1295 len -= m->m_len;
1296 if (m == NULL)
1297 return (NULL);
1298 remain = m->m_len - len;
1299 if (m0->m_flags & M_PKTHDR) {
1300 n = m_gethdr(wait, m0->m_type);
1301 if (n == NULL)
1302 return (NULL);
1303 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
1304 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
1305 m0->m_pkthdr.len = len0;
1306 if (m->m_flags & M_EXT)
1307 goto extpacket;
1308 if (remain > MHLEN) {
1309 /* m can't be the lead packet */
1310 MH_ALIGN(n, 0);
1311 n->m_next = m_split(m, len, wait);
1312 if (n->m_next == NULL) {
1313 m_free(n);
1314 return (NULL);
1315 } else {
1316 n->m_len = 0;
1317 return (n);
1318 }
1319 } else
1320 MH_ALIGN(n, remain);
1321 } else if (remain == 0) {
1322 n = m->m_next;
1323 m->m_next = 0;
1324 return (n);
1325 } else {
1326 n = m_get(wait, m->m_type);
1327 if (n == NULL)
1328 return (NULL);
1329 M_ALIGN(n, remain);
1330 }
1331extpacket:
1332 if (m->m_flags & M_EXT) {
1333 KKASSERT((n->m_flags & M_EXT) == 0);
1334 n->m_data = m->m_data + len;
1335 m->m_ext.ext_ref(m->m_ext.ext_arg);
1336 n->m_ext = m->m_ext;
1337 n->m_flags |= m->m_flags & (M_EXT | M_EXT_CLUSTER);
1338 } else {
1339 bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
1340 }
1341 n->m_len = remain;
1342 m->m_len = len;
1343 n->m_next = m->m_next;
1344 m->m_next = 0;
1345 return (n);
1346}
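
/*
 * Illustrative sketch: splitting a packet, e.g. for fragmentation.  As
 * noted above, the tail may share a cluster with the head, so
 * M_WRITABLE() should be checked before modifying either piece.  The
 * example_* name is hypothetical.
 */
#if 0
static struct mbuf *
example_fragment_tail(struct mbuf *m0, int mtu)
{
	struct mbuf *tail;

	tail = m_split(m0, mtu, MB_DONTWAIT);
	/* NULL means the allocation failed; m0 should still be intact */
	return (tail);
}
#endif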
1347
1348/*
1349 * Routine to copy from device local memory into mbufs.
1350 * Note: "offset" is ill-defined and always called as 0, so ignore it.
1351 */
1352struct mbuf *
1353m_devget(char *buf, int len, int offset, struct ifnet *ifp,
1354 void (*copy)(volatile const void *from, volatile void *to, size_t length))
1355{
1356 struct mbuf *m, *mfirst = NULL, **mtail;
1357 int nsize, flags;
1358
1359 if (copy == NULL)
1360 copy = bcopy;
1361 mtail = &mfirst;
1362 flags = M_PKTHDR;
1363
1364 while (len > 0) {
1365 m = m_getl(len, MB_DONTWAIT, MT_DATA, flags, &nsize);
1366 if (m == NULL) {
1367 m_freem(mfirst);
1368 return (NULL);
1369 }
1370 m->m_len = min(len, nsize);
1371
1372 if (flags & M_PKTHDR) {
1373 if (len + max_linkhdr <= nsize)
1374 m->m_data += max_linkhdr;
1375 m->m_pkthdr.rcvif = ifp;
1376 m->m_pkthdr.len = len;
1377 flags = 0;
1378 }
1379
1380 copy(buf, m->m_data, (unsigned)m->m_len);
1381 buf += m->m_len;
1382 len -= m->m_len;
1383 *mtail = m;
1384 mtail = &m->m_next;
1385 }
1386
1387 return (mfirst);
1388}
1389
1390/*
1391 * Copy data from a buffer back into the indicated mbuf chain,
1392 * starting "off" bytes from the beginning, extending the mbuf
1393 * chain if necessary.
1394 */
1395void
1396m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
1397{
1398 int mlen;
1399 struct mbuf *m = m0, *n;
1400 int totlen = 0;
1401
1402 if (m0 == NULL)
1403 return;
1404 while (off > (mlen = m->m_len)) {
1405 off -= mlen;
1406 totlen += mlen;
1407 if (m->m_next == NULL) {
1408 n = m_getclr(MB_DONTWAIT, m->m_type);
1409 if (n == NULL)
1410 goto out;
1411 n->m_len = min(MLEN, len + off);
1412 m->m_next = n;
1413 }
1414 m = m->m_next;
1415 }
1416 while (len > 0) {
1417 mlen = min (m->m_len - off, len);
1418 bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
1419 cp += mlen;
1420 len -= mlen;
1421 mlen += off;
1422 off = 0;
1423 totlen += mlen;
1424 if (len == 0)
1425 break;
1426 if (m->m_next == NULL) {
1427 n = m_get(MB_DONTWAIT, m->m_type);
1428 if (n == NULL)
1429 break;
1430 n->m_len = min(MLEN, len);
1431 m->m_next = n;
1432 }
1433 m = m->m_next;
1434 }
1435out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
1436 m->m_pkthdr.len = totlen;
1437}
1438
1439void
1440m_print(const struct mbuf *m)
1441{
1442 int len;
1443 const struct mbuf *m2;
1444
1445 len = m->m_pkthdr.len;
1446 m2 = m;
1447 while (len) {
1448 kprintf("%p %*D\n", m2, m2->m_len, (u_char *)m2->m_data, "-");
1449 len -= m2->m_len;
1450 m2 = m2->m_next;
1451 }
1452 return;
1453}
1454
1455/*
1456 * "Move" mbuf pkthdr from "from" to "to".
1457 * "from" must have M_PKTHDR set, and "to" must be empty.
1458 */
1459void
1460m_move_pkthdr(struct mbuf *to, struct mbuf *from)
1461{
1462 KASSERT((to->m_flags & M_PKTHDR), ("m_move_pkthdr: not packet header"));
1463
1464 to->m_flags |= from->m_flags & M_COPYFLAGS;
1465 to->m_pkthdr = from->m_pkthdr; /* especially tags */
1466 SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */
1467}
1468
1469/*
1470 * Duplicate "from"'s mbuf pkthdr in "to".
1471 * "from" must have M_PKTHDR set, and "to" must be empty.
1472 * In particular, this does a deep copy of the packet tags.
1473 */
1474int
1475m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how)
1476{
1477 KASSERT((to->m_flags & M_PKTHDR), ("m_dup_pkthdr: not packet header"));
1478
1479 to->m_flags = (from->m_flags & M_COPYFLAGS) |
1480 (to->m_flags & ~M_COPYFLAGS);
1481 to->m_pkthdr = from->m_pkthdr;
1482 SLIST_INIT(&to->m_pkthdr.tags);
1483 return (m_tag_copy_chain(to, from, how));
1484}
1485
1486/*
1487 * Defragment a mbuf chain, returning the shortest possible
1488 * chain of mbufs and clusters. If allocation fails and
1489 * this cannot be completed, NULL will be returned, but
1490 * the passed in chain will be unchanged. Upon success,
1491 * the original chain will be freed, and the new chain
1492 * will be returned.
1493 *
1494 * If a non-packet header is passed in, the original
1495 * mbuf (chain?) will be returned unharmed.
1496 *
1497 * m_defrag_nofree doesn't free the passed in mbuf.
1498 */
1499struct mbuf *
1500m_defrag(struct mbuf *m0, int how)
1501{
1502 struct mbuf *m_new;
1503
1504 if ((m_new = m_defrag_nofree(m0, how)) == NULL)
1505 return (NULL);
1506 if (m_new != m0)
1507 m_freem(m0);
1508 return (m_new);
1509}
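
/*
 * Illustrative sketch: the common driver use of m_defrag(), compacting a
 * transmit chain when it has more fragments than the hardware can DMA.
 * The segment limit and the example_* name are hypothetical.
 */
#if 0
static struct mbuf *
example_tx_prepare(struct mbuf *m, int max_segs)
{
	struct mbuf *m2;
	int nsegs = 0;

	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		++nsegs;
	if (nsegs <= max_segs)
		return (m);

	/* on success the original chain has already been freed */
	m2 = m_defrag(m, MB_DONTWAIT);
	return (m2);		/* NULL: caller still owns and frees 'm' */
}
#endif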
1510
1511struct mbuf *
1512m_defrag_nofree(struct mbuf *m0, int how)
1513{
1514 struct mbuf *m_new = NULL, *m_final = NULL;
1515 int progress = 0, length, nsize;
1516
1517 if (!(m0->m_flags & M_PKTHDR))
1518 return (m0);
1519
1520#ifdef MBUF_STRESS_TEST
1521 if (m_defragrandomfailures) {
1522 int temp = karc4random() & 0xff;
1523 if (temp == 0xba)
1524 goto nospace;
1525 }
1526#endif
1527
1528 m_final = m_getl(m0->m_pkthdr.len, how, MT_DATA, M_PKTHDR, &nsize);
1529 if (m_final == NULL)
1530 goto nospace;
1531 m_final->m_len = 0; /* in case m0->m_pkthdr.len is zero */
1532
1533 if (!m_dup_pkthdr(m_final, m0, how))
1534 goto nospace;
1535
1536 m_new = m_final;
1537
1538 while (progress < m0->m_pkthdr.len) {
1539 length = m0->m_pkthdr.len - progress;
1540 if (length > MCLBYTES)
1541 length = MCLBYTES;
1542
1543 if (m_new == NULL) {
1544 m_new = m_getl(length, how, MT_DATA, 0, &nsize);
1545 if (m_new == NULL)
1546 goto nospace;
1547 }
1548
1549 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
1550 progress += length;
1551 m_new->m_len = length;
1552 if (m_new != m_final)
1553 m_cat(m_final, m_new);
1554 m_new = NULL;
1555 }
1556 if (m0->m_next == NULL)
1557 m_defraguseless++;
1558 m_defragpackets++;
1559 m_defragbytes += m_final->m_pkthdr.len;
1560 return (m_final);
1561nospace:
1562 m_defragfailure++;
1563 if (m_new)
1564 m_free(m_new);
1565 m_freem(m_final);
1566 return (NULL);
1567}
1568
1569/*
1570 * Move data from uio into mbufs.
1571 */
1572struct mbuf *
1573m_uiomove(struct uio *uio)
1574{
1575 struct mbuf *m; /* current working mbuf */
1576 struct mbuf *head = NULL; /* result mbuf chain */
1577 struct mbuf **mp = &head;
1578 int resid = uio->uio_resid, nsize, flags = M_PKTHDR, error;
1579
1580 do {
1581 m = m_getl(resid, MB_WAIT, MT_DATA, flags, &nsize);
1582 if (flags) {
1583 m->m_pkthdr.len = 0;
1584 /* Leave room for protocol headers. */
1585 if (resid < MHLEN)
1586 MH_ALIGN(m, resid);
1587 flags = 0;
1588 }
1589 m->m_len = min(nsize, resid);
1590 error = uiomove(mtod(m, caddr_t), m->m_len, uio);
1591 if (error) {
1592 m_free(m);
1593 goto failed;
1594 }
1595 *mp = m;
1596 mp = &m->m_next;
1597 head->m_pkthdr.len += m->m_len;
1598 resid -= m->m_len;
1599 } while (resid > 0);
1600
1601 return (head);
1602
1603failed:
1604 m_freem(head);
1605 return (NULL);
1606}
1607
1608struct mbuf *
1609m_last(struct mbuf *m)
1610{
1611 while (m->m_next)
1612 m = m->m_next;
1613 return (m);
1614}
1615
1616/*
1617 * Return the number of bytes in an mbuf chain.
1618 * If lastm is not NULL, also return the last mbuf.
1619 */
1620u_int
1621m_lengthm(struct mbuf *m, struct mbuf **lastm)
1622{
1623 u_int len = 0;
1624 struct mbuf *prev = m;
1625
1626 while (m) {
1627 len += m->m_len;
1628 prev = m;
1629 m = m->m_next;
1630 }
1631 if (lastm != NULL)
1632 *lastm = prev;
1633 return (len);
1634}
1635
1636/*
1637 * Like m_lengthm(), except also keep track of mbuf usage.
1638 */
1639u_int
1640m_countm(struct mbuf *m, struct mbuf **lastm, u_int *pmbcnt)
1641{
1642 u_int len = 0, mbcnt = 0;
1643 struct mbuf *prev = m;
1644
1645 while (m) {
1646 len += m->m_len;
1647 mbcnt += MSIZE;
1648 if (m->m_flags & M_EXT)
1649 mbcnt += m->m_ext.ext_size;
1650 prev = m;
1651 m = m->m_next;
1652 }
1653 if (lastm != NULL)
1654 *lastm = prev;
1655 *pmbcnt = mbcnt;
1656 return (len);
1657}