X-Git-Url: https://gitweb.dragonflybsd.org/~josepht/dragonfly.git/blobdiff_plain/f679d2c676683584d5870e2cb1e4bf94cf2cdabc..33dbeae810812d056db4c1a65d540b9dceeccede:/sys/kern/uipc_mbuf.c diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 0965af9093..22323f0003 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -1,4 +1,6 @@ /* + * (MPSAFE) + * * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. * Copyright (c) 2004 The DragonFly Project. All rights reserved. * @@ -72,6 +74,7 @@ #include "opt_mbuf_stress_test.h" #include #include +#include #include #include #include @@ -83,7 +86,9 @@ #include #include #include + #include +#include #include #include @@ -134,18 +139,21 @@ mbtrack_cmp(struct mbtrack *mb1, struct mbtrack *mb2) RB_GENERATE2(mbuf_rb_tree, mbtrack, rb_node, mbtrack_cmp, struct mbuf *, m); struct mbuf_rb_tree mbuf_track_root; +static struct spinlock mbuf_track_spin = SPINLOCK_INITIALIZER(mbuf_track_spin); static void mbuftrack(struct mbuf *m) { struct mbtrack *mbt; - crit_enter(); mbt = kmalloc(sizeof(*mbt), M_MTRACK, M_INTWAIT|M_ZERO); + spin_lock(&mbuf_track_spin); mbt->m = m; - if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) + if (mbuf_rb_tree_RB_INSERT(&mbuf_track_root, mbt)) { + spin_unlock(&mbuf_track_spin); panic("mbuftrack: mbuf %p already being tracked\n", m); - crit_exit(); + } + spin_unlock(&mbuf_track_spin); } static void @@ -153,15 +161,16 @@ mbufuntrack(struct mbuf *m) { struct mbtrack *mbt; - crit_enter(); + spin_lock(&mbuf_track_spin); mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m); if (mbt == NULL) { - kprintf("mbufuntrack: mbuf %p was not tracked\n", m); + spin_unlock(&mbuf_track_spin); + panic("mbufuntrack: mbuf %p was not tracked\n", m); } else { mbuf_rb_tree_RB_REMOVE(&mbuf_track_root, mbt); + spin_unlock(&mbuf_track_spin); kfree(mbt, M_MTRACK); } - crit_exit(); } void @@ -170,18 +179,21 @@ mbuftrackid(struct mbuf *m, int trackid) struct mbtrack *mbt; struct mbuf *n; - crit_enter(); + spin_lock(&mbuf_track_spin); while (m) { n = m->m_nextpkt; while (m) { mbt = mbuf_rb_tree_RB_LOOKUP(&mbuf_track_root, m); - if (mbt) - mbt->trackid = trackid; + if (mbt == NULL) { + spin_unlock(&mbuf_track_spin); + panic("mbuftrackid: mbuf %p not tracked", m); + } + mbt->trackid = trackid; m = m->m_next; } m = n; } - crit_exit(); + spin_unlock(&mbuf_track_spin); } static int @@ -193,7 +205,9 @@ mbuftrack_callback(struct mbtrack *mbt, void *arg) ksnprintf(buf, sizeof(buf), "mbuf %p track %d\n", mbt->m, mbt->trackid); + spin_unlock(&mbuf_track_spin); error = SYSCTL_OUT(req, buf, strlen(buf)); + spin_lock(&mbuf_track_spin); if (error) return(-error); return(0); @@ -204,10 +218,10 @@ mbuftrack_show(SYSCTL_HANDLER_ARGS) { int error; - crit_enter(); + spin_lock(&mbuf_track_spin); error = mbuf_rb_tree_RB_SCAN(&mbuf_track_root, NULL, mbuftrack_callback, req); - crit_exit(); + spin_unlock(&mbuf_track_spin); return (-error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, showmbufs, CTLFLAG_RD|CTLTYPE_STRING, @@ -239,25 +253,27 @@ int m_defragrandomfailures; #endif struct objcache *mbuf_cache, *mbufphdr_cache; -struct objcache *mclmeta_cache; +struct objcache *mclmeta_cache, *mjclmeta_cache; struct objcache *mbufcluster_cache, *mbufphdrcluster_cache; +struct objcache *mbufjcluster_cache, *mbufphdrjcluster_cache; int nmbclusters; int nmbufs; SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, - &max_linkhdr, 0, ""); + &max_linkhdr, 0, "Max size of a link-level header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, - &max_protohdr, 0, 
""); -SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); + &max_protohdr, 0, "Max size of a protocol header"); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, + "Max size of link+protocol headers"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, - &max_datalen, 0, ""); + &max_datalen, 0, "Max data payload size without headers"); SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, - &mbuf_wait, 0, ""); + &mbuf_wait, 0, "Time in ticks to sleep after failed mbuf allocations"); static int do_mbstat(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT|CTLFLAG_RD, - 0, 0, do_mbstat, "S,mbstat", ""); + 0, 0, do_mbstat, "S,mbstat", "mbuf usage statistics"); static int do_mbtypes(SYSCTL_HANDLER_ARGS); @@ -329,13 +345,13 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0, "Maximum number of mbufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, - &m_defragpackets, 0, ""); + &m_defragpackets, 0, "Number of defragment packets"); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, - &m_defragbytes, 0, ""); + &m_defragbytes, 0, "Number of defragment bytes"); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, - &m_defraguseless, 0, ""); + &m_defraguseless, 0, "Number of useless defragment mbuf chain operations"); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, - &m_defragfailure, 0, ""); + &m_defragfailure, 0, "Number of failed defragment mbuf chain operations"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); @@ -343,17 +359,23 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); static MALLOC_DEFINE(M_MBUFCL, "mbufcl", "mbufcl"); +static MALLOC_DEFINE(M_MJBUFCL, "mbufcl", "mbufcl"); static MALLOC_DEFINE(M_MCLMETA, "mclmeta", "mclmeta"); +static MALLOC_DEFINE(M_MJCLMETA, "mjclmeta", "mjclmeta"); static void m_reclaim (void); static void m_mclref(void *arg); static void m_mclfree(void *arg); +/* + * NOTE: Default NMBUFS must take into account a possible DOS attack + * using fd passing on unix domain sockets. + */ #ifndef NMBCLUSTERS #define NMBCLUSTERS (512 + maxusers * 16) #endif #ifndef NMBUFS -#define NMBUFS (nmbclusters * 2) +#define NMBUFS (nmbclusters * 2 + maxfiles) #endif /* @@ -442,6 +464,23 @@ mclmeta_ctor(void *obj, void *private, int ocflags) return (TRUE); } +static boolean_t +mjclmeta_ctor(void *obj, void *private, int ocflags) +{ + struct mbcluster *cl = obj; + void *buf; + + if (ocflags & M_NOWAIT) + buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_NOWAIT | M_ZERO); + else + buf = kmalloc(MJUMPAGESIZE, M_MBUFCL, M_INTWAIT | M_ZERO); + if (buf == NULL) + return (FALSE); + cl->mcl_refs = 0; + cl->mcl_data = buf; + return (TRUE); +} + static void mclmeta_dtor(void *obj, void *private) { @@ -452,7 +491,7 @@ mclmeta_dtor(void *obj, void *private) } static void -linkcluster(struct mbuf *m, struct mbcluster *cl) +linkjcluster(struct mbuf *m, struct mbcluster *cl, uint size) { /* * Add the cluster to the mbuf. 
The caller will detect that the @@ -462,13 +501,19 @@ linkcluster(struct mbuf *m, struct mbcluster *cl) m->m_ext.ext_buf = cl->mcl_data; m->m_ext.ext_ref = m_mclref; m->m_ext.ext_free = m_mclfree; - m->m_ext.ext_size = MCLBYTES; + m->m_ext.ext_size = size; atomic_add_int(&cl->mcl_refs, 1); m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT | M_EXT_CLUSTER; } +static void +linkcluster(struct mbuf *m, struct mbcluster *cl) +{ + linkjcluster(m, cl, MCLBYTES); +} + static boolean_t mbufphdrcluster_ctor(void *obj, void *private, int ocflags) { @@ -486,6 +531,23 @@ mbufphdrcluster_ctor(void *obj, void *private, int ocflags) return (TRUE); } +static boolean_t +mbufphdrjcluster_ctor(void *obj, void *private, int ocflags) +{ + struct mbuf *m = obj; + struct mbcluster *cl; + + mbufphdr_ctor(obj, private, ocflags); + cl = objcache_get(mjclmeta_cache, ocflags); + if (cl == NULL) { + ++mbstat[mycpu->gd_cpuid].m_drops; + return (FALSE); + } + m->m_flags |= M_CLCACHE; + linkjcluster(m, cl, MJUMPAGESIZE); + return (TRUE); +} + static boolean_t mbufcluster_ctor(void *obj, void *private, int ocflags) { @@ -503,6 +565,23 @@ mbufcluster_ctor(void *obj, void *private, int ocflags) return (TRUE); } +static boolean_t +mbufjcluster_ctor(void *obj, void *private, int ocflags) +{ + struct mbuf *m = obj; + struct mbcluster *cl; + + mbuf_ctor(obj, private, ocflags); + cl = objcache_get(mjclmeta_cache, ocflags); + if (cl == NULL) { + ++mbstat[mycpu->gd_cpuid].m_drops; + return (FALSE); + } + m->m_flags |= M_CLCACHE; + linkjcluster(m, cl, MJUMPAGESIZE); + return (TRUE); +} + /* * Used for both the cluster and cluster PHDR caches. * @@ -520,7 +599,10 @@ mbufcluster_dtor(void *obj, void *private) mcl = m->m_ext.ext_arg; KKASSERT(mcl->mcl_refs == 1); mcl->mcl_refs = 0; - objcache_put(mclmeta_cache, mcl); + if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) + objcache_put(mjclmeta_cache, mcl); + else + objcache_put(mclmeta_cache, mcl); } } @@ -542,6 +624,7 @@ mbinit(void *dummy) for (i = 0; i < ncpus; i++) { atomic_set_long_nonlocked(&mbstat[i].m_msize, MSIZE); atomic_set_long_nonlocked(&mbstat[i].m_mclbytes, MCLBYTES); + atomic_set_long_nonlocked(&mbstat[i].m_mjumpagesize, MJUMPAGESIZE); atomic_set_long_nonlocked(&mbstat[i].m_minclsize, MINCLSIZE); atomic_set_long_nonlocked(&mbstat[i].m_mlen, MLEN); atomic_set_long_nonlocked(&mbstat[i].m_mhlen, MHLEN); @@ -571,6 +654,11 @@ mbinit(void *dummy) mclmeta_ctor, mclmeta_dtor, NULL, objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args); + cl_limit = nmbclusters; + mjclmeta_cache = objcache_create("jcluster mbuf", &cl_limit, 0, + mjclmeta_ctor, mclmeta_dtor, NULL, + objcache_malloc_alloc, objcache_malloc_free, &mclmeta_malloc_args); + limit = nmbclusters; mbufcluster_cache = objcache_create("mbuf + cluster", &limit, 0, mbufcluster_ctor, mbufcluster_dtor, NULL, @@ -583,6 +671,18 @@ mbinit(void *dummy) objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); mb_limit += limit; + limit = nmbclusters; + mbufjcluster_cache = objcache_create("mbuf + jcluster", &limit, 0, + mbufjcluster_ctor, mbufcluster_dtor, NULL, + objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); + mb_limit += limit; + + limit = nmbclusters; + mbufphdrjcluster_cache = objcache_create("mbuf pkt hdr + jcluster", + &limit, 64, mbufphdrjcluster_ctor, mbufcluster_dtor, NULL, + objcache_malloc_alloc, objcache_malloc_free, &mbuf_malloc_args); + mb_limit += limit; + /* * Adjust backing kmalloc pools' limit * @@ -592,7 +692,8 @@ mbinit(void *dummy) cl_limit += cl_limit / 8; 
kmalloc_raise_limit(mclmeta_malloc_args.mtype, mclmeta_malloc_args.objsize * cl_limit); - kmalloc_raise_limit(M_MBUFCL, MCLBYTES * cl_limit); + kmalloc_raise_limit(M_MBUFCL, MCLBYTES * cl_limit * 3/4 + MJUMPAGESIZE * cl_limit / 4); + /*kmalloc_raise_limit(M_MBUFCL, MCLBYTES * cl_limit);*/ mb_limit += mb_limit / 8; kmalloc_raise_limit(mbuf_malloc_args.mtype, @@ -638,14 +739,14 @@ m_reclaim(void) struct domain *dp; struct protosw *pr; - crit_enter(); + kprintf("Debug: m_reclaim() called\n"); + SLIST_FOREACH(dp, &domains, dom_next) { for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { if (pr->pr_drain) (*pr->pr_drain)(); } } - crit_exit(); atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_drain, 1); } @@ -653,9 +754,13 @@ static void __inline updatestats(struct mbuf *m, int type) { struct globaldata *gd = mycpu; - m->m_type = type; + m->m_type = type; mbuftrack(m); +#ifdef MBUF_DEBUG + KASSERT(m->m_next == NULL, ("mbuf %p: bad m_next in get", m)); + KASSERT(m->m_nextpkt == NULL, ("mbuf %p: bad m_nextpkt in get", m)); +#endif atomic_add_long_nonlocked(&mbtypes[gd->gd_cpuid][type], 1); atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mbufs, 1); @@ -680,9 +785,12 @@ retryonce: if ((how & MB_TRYWAIT) && ntries++ == 0) { struct objcache *reclaimlist[] = { mbufphdr_cache, - mbufcluster_cache, mbufphdrcluster_cache + mbufcluster_cache, + mbufphdrcluster_cache, + mbufjcluster_cache, + mbufphdrjcluster_cache }; - const int nreclaims = __arysize(reclaimlist); + const int nreclaims = NELEM(reclaimlist); if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf)) m_reclaim(); @@ -691,6 +799,10 @@ retryonce: ++mbstat[mycpu->gd_cpuid].m_drops; return (NULL); } +#ifdef MBUF_DEBUG + KASSERT(m->m_data == m->m_dat, ("mbuf %p: bad m_data in get", m)); +#endif + m->m_len = 0; updatestats(m, type); return (m); @@ -711,9 +823,10 @@ retryonce: if ((how & MB_TRYWAIT) && ntries++ == 0) { struct objcache *reclaimlist[] = { mbuf_cache, - mbufcluster_cache, mbufphdrcluster_cache + mbufcluster_cache, mbufphdrcluster_cache, + mbufjcluster_cache, mbufphdrjcluster_cache }; - const int nreclaims = __arysize(reclaimlist); + const int nreclaims = NELEM(reclaimlist); if (!objcache_reclaimlist(reclaimlist, nreclaims, ocf)) m_reclaim(); @@ -722,6 +835,11 @@ retryonce: ++mbstat[mycpu->gd_cpuid].m_drops; return (NULL); } +#ifdef MBUF_DEBUG + KASSERT(m->m_data == m->m_pktdat, ("mbuf %p: bad m_data in get", m)); +#endif + m->m_len = 0; + m->m_pkthdr.len = 0; updatestats(m, type); return (m); @@ -742,6 +860,51 @@ m_getclr(int how, int type) return (m); } +struct mbuf * +m_getjcl(int how, short type, int flags, size_t size) +{ + struct mbuf *m = NULL; + int ocflags = MBTOM(how); + int ntries = 0; + +retryonce: + + if (flags & M_PKTHDR) + m = objcache_get(mbufphdrjcluster_cache, ocflags); + else + m = objcache_get(mbufjcluster_cache, ocflags); + + if (m == NULL) { + if ((how & MB_TRYWAIT) && ntries++ == 0) { + struct objcache *reclaimlist[1]; + + if (flags & M_PKTHDR) + reclaimlist[0] = mbufjcluster_cache; + else + reclaimlist[0] = mbufphdrjcluster_cache; + if (!objcache_reclaimlist(reclaimlist, 1, ocflags)) + m_reclaim(); + goto retryonce; + } + ++mbstat[mycpu->gd_cpuid].m_drops; + return (NULL); + } + +#ifdef MBUF_DEBUG + KASSERT(m->m_data == m->m_ext.ext_buf, + ("mbuf %p: bad m_data in get", m)); +#endif + m->m_type = type; + m->m_len = 0; + m->m_pkthdr.len = 0; /* just do it unconditonally */ + + mbuftrack(m); + + atomic_add_long_nonlocked(&mbtypes[mycpu->gd_cpuid][type], 1); + 
atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1); + return (m); +} + /* * Returns an mbuf with an attached cluster. * Because many network drivers use this kind of buffers a lot, it is @@ -779,7 +942,13 @@ retryonce: return (NULL); } +#ifdef MBUF_DEBUG + KASSERT(m->m_data == m->m_ext.ext_buf, + ("mbuf %p: bad m_data in get", m)); +#endif m->m_type = type; + m->m_len = 0; + m->m_pkthdr.len = 0; /* just do it unconditonally */ mbuftrack(m); @@ -853,7 +1022,8 @@ m_mclget(struct mbuf *m, int how) mcl = objcache_get(mclmeta_cache, MBTOM(how)); if (mcl != NULL) { linkcluster(m, mcl); - atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1); + atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, + 1); } else { ++mbstat[mycpu->gd_cpuid].m_drops; } @@ -866,10 +1036,7 @@ m_mclget(struct mbuf *m, int how) * since multiple entities may have a reference on the cluster. * * m_mclfree() is almost the same but it must contend with two entities - * freeing the cluster at the same time. If there is only one reference - * count we are the only entity referencing the cluster and no further - * locking is required. Otherwise we must protect against a race to 0 - * with the serializer. + * freeing the cluster at the same time. */ static void m_mclref(void *arg) @@ -901,13 +1068,24 @@ m_mclfree(void *arg) * code does not call M_PREPEND properly. * (example: call to bpf_mtap from drivers) */ + +#ifdef MBUF_DEBUG + +struct mbuf * +_m_free(struct mbuf *m, const char *func) + +#else + struct mbuf * m_free(struct mbuf *m) + +#endif { struct mbuf *n; struct globaldata *gd = mycpu; KASSERT(m->m_type != MT_FREE, ("freeing free mbuf %p", m)); + KASSERT(M_TRAILINGSPACE(m) >= 0, ("overflowed mbuf %p", m)); atomic_subtract_long_nonlocked(&mbtypes[gd->gd_cpuid][m->m_type], 1); n = m->m_next; @@ -918,6 +1096,9 @@ m_free(struct mbuf *m) */ m->m_next = NULL; mbufuntrack(m); +#ifdef MBUF_DEBUG + m->m_hdr.mh_lastfunc = func; +#endif #ifdef notyet KKASSERT(m->m_nextpkt == NULL); #else @@ -926,7 +1107,7 @@ m_free(struct mbuf *m) if (afewtimes-- > 0) { kprintf("mfree: m->m_nextpkt != NULL\n"); - print_backtrace(); + print_backtrace(-1); } m->m_nextpkt = NULL; } @@ -957,7 +1138,6 @@ m_free(struct mbuf *m) * and is totally separate from whether the mbuf is currently * associated with a cluster. */ - crit_enter(); switch(m->m_flags & (M_CLCACHE | M_EXT | M_EXT_CLUSTER)) { case M_CLCACHE | M_EXT | M_EXT_CLUSTER: /* @@ -974,10 +1154,17 @@ m_free(struct mbuf *m) * an mbuf). */ m->m_data = m->m_ext.ext_buf; - if (m->m_flags & M_PHCACHE) - objcache_put(mbufphdrcluster_cache, m); - else - objcache_put(mbufcluster_cache, m); + if (m->m_flags & M_EXT && m->m_ext.ext_size != MCLBYTES) { + if (m->m_flags & M_PHCACHE) + objcache_put(mbufphdrjcluster_cache, m); + else + objcache_put(mbufjcluster_cache, m); + } else { + if (m->m_flags & M_PHCACHE) + objcache_put(mbufphdrcluster_cache, m); + else + objcache_put(mbufcluster_cache, m); + } atomic_subtract_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_clusters, 1); } else { /* @@ -992,12 +1179,20 @@ m_free(struct mbuf *m) * XXX we could try to connect another cluster to * it. 
*/ + m->m_ext.ext_free(m->m_ext.ext_arg); m->m_flags &= ~(M_EXT | M_EXT_CLUSTER); - if (m->m_flags & M_PHCACHE) - objcache_dtor(mbufphdrcluster_cache, m); - else - objcache_dtor(mbufcluster_cache, m); + if (m->m_ext.ext_size == MCLBYTES) { + if (m->m_flags & M_PHCACHE) + objcache_dtor(mbufphdrcluster_cache, m); + else + objcache_dtor(mbufcluster_cache, m); + } else { + if (m->m_flags & M_PHCACHE) + objcache_dtor(mbufphdrjcluster_cache, m); + else + objcache_dtor(mbufjcluster_cache, m); + } } break; case M_EXT | M_EXT_CLUSTER: @@ -1036,19 +1231,29 @@ m_free(struct mbuf *m) panic("bad mbuf flags %p %08x\n", m, m->m_flags); break; } - crit_exit(); return (n); } +#ifdef MBUF_DEBUG + +void +_m_freem(struct mbuf *m, const char *func) +{ + while (m) + m = _m_free(m, func); +} + +#else + void m_freem(struct mbuf *m) { - crit_enter(); while (m) m = m_free(m); - crit_exit(); } +#endif + /* * mbuf utility routines */ @@ -1097,7 +1302,7 @@ m_copym(const struct mbuf *m, int off0, int len, int wait) KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); - if (off == 0 && m->m_flags & M_PKTHDR) + if (off == 0 && (m->m_flags & M_PKTHDR)) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); @@ -1107,7 +1312,7 @@ m_copym(const struct mbuf *m, int off0, int len, int wait) m = m->m_next; } np = ⊤ - top = 0; + top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, @@ -1316,6 +1521,83 @@ nospace0: return (NULL); } +/* + * Copy the non-packet mbuf data chain into a new set of mbufs, including + * copying any mbuf clusters. This is typically used to realign a data + * chain by nfs_realign(). + * + * The original chain is left intact. how should be MB_WAIT or MB_DONTWAIT + * and NULL can be returned if MB_DONTWAIT is passed. + * + * Be careful to use cluster mbufs, a large mbuf chain converted to non + * cluster mbufs can exhaust our supply of mbufs. + */ +struct mbuf * +m_dup_data(struct mbuf *m, int how) +{ + struct mbuf **p, *n, *top = NULL; + int mlen, moff, chunk, gsize, nsize; + + /* + * Degenerate case + */ + if (m == NULL) + return (NULL); + + /* + * Optimize the mbuf allocation but do not get too carried away. + */ + if (m->m_next || m->m_len > MLEN) + if (m->m_flags & M_EXT && m->m_ext.ext_size == MCLBYTES) + gsize = MCLBYTES; + else + gsize = MJUMPAGESIZE; + else + gsize = MLEN; + + /* Chain control */ + p = ⊤ + n = NULL; + nsize = 0; + + /* + * Scan the mbuf chain until nothing is left, the new mbuf chain + * will be allocated on the fly as needed. + */ + while (m) { + mlen = m->m_len; + moff = 0; + + while (mlen) { + KKASSERT(m->m_type == MT_DATA); + if (n == NULL) { + n = m_getl(gsize, how, MT_DATA, 0, &nsize); + n->m_len = 0; + if (n == NULL) + goto nospace; + *p = n; + p = &n->m_next; + } + chunk = imin(mlen, nsize); + bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); + mlen -= chunk; + moff += chunk; + n->m_len += chunk; + nsize -= chunk; + if (nsize == 0) + n = NULL; + } + m = m->m_next; + } + *p = NULL; + return(top); +nospace: + *p = NULL; + m_freem(top); + atomic_add_long_nonlocked(&mbstat[mycpu->gd_cpuid].m_mcfail, 1); + return (NULL); +} + /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). 
@@ -1707,6 +1989,52 @@ out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } +/* + * Append the specified data to the indicated mbuf chain, + * Extend the mbuf chain if the new data does not fit in + * existing space. + * + * Return 1 if able to complete the job; otherwise 0. + */ +int +m_append(struct mbuf *m0, int len, c_caddr_t cp) +{ + struct mbuf *m, *n; + int remainder, space; + + for (m = m0; m->m_next != NULL; m = m->m_next) + ; + remainder = len; + space = M_TRAILINGSPACE(m); + if (space > 0) { + /* + * Copy into available space. + */ + if (space > remainder) + space = remainder; + bcopy(cp, mtod(m, caddr_t) + m->m_len, space); + m->m_len += space; + cp += space, remainder -= space; + } + while (remainder > 0) { + /* + * Allocate a new mbuf; could check space + * and allocate a cluster instead. + */ + n = m_get(MB_DONTWAIT, m->m_type); + if (n == NULL) + break; + n->m_len = min(MLEN, remainder); + bcopy(cp, mtod(n, caddr_t), n->m_len); + cp += n->m_len, remainder -= n->m_len; + m->m_next = n; + m = n; + } + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += len - remainder; + return (remainder == 0); +} + /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. @@ -1898,93 +2226,6 @@ nospace: return (NULL); } - - -/* - * Defragment an mbuf chain, returning at most maxfrags separate - * mbufs+clusters. If this is not possible NULL is returned and - * the original mbuf chain is left in it's present (potentially - * modified) state. We use two techniques: collapsing consecutive - * mbufs and replacing consecutive mbufs by a cluster. - * - * NB: this should really be named m_defrag but that name is taken - */ -struct mbuf * -m_collapse(struct mbuf *m0, int how, int maxfrags) -{ - struct mbuf *m, *n, *n2, **prev; - u_int curfrags; - - /* - * Calculate the current number of frags. - */ - curfrags = 0; - for (m = m0; m != NULL; m = m->m_next) - curfrags++; - /* - * First, try to collapse mbufs. Note that we always collapse - * towards the front so we don't need to deal with moving the - * pkthdr. This may be suboptimal if the first mbuf has much - * less data than the following. - */ - m = m0; -again: - for (;;) { - n = m->m_next; - if (n == NULL) - break; - if (n->m_len < M_TRAILINGSPACE(m)) { - bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, - n->m_len); - m->m_len += n->m_len; - m->m_next = n->m_next; - m_free(n); - if (--curfrags <= maxfrags) - return m0; - } else - m = n; - } - KASSERT(maxfrags > 1, - ("maxfrags %u, but normal collapse failed", maxfrags)); - /* - * Collapse consecutive mbufs to a cluster. - */ - prev = &m0->m_next; /* NB: not the first mbuf */ - while ((n = *prev) != NULL) { - if ((n2 = n->m_next) != NULL && - n->m_len + n2->m_len < MCLBYTES) { - m = m_getcl(how, MT_DATA, 0); - if (m == NULL) - goto bad; - bcopy(mtod(n, void *), mtod(m, void *), n->m_len); - bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, - n2->m_len); - m->m_len = n->m_len + n2->m_len; - m->m_next = n2->m_next; - *prev = m; - m_free(n); - m_free(n2); - if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ - return m0; - /* - * Still not there, try the normal collapse - * again before we allocate another cluster. - */ - goto again; - } - prev = &n->m_next; - } - /* - * No place where we can collapse to a cluster; punt. 
- * This can occur if, for example, you request 2 frags - * but the packet requires that both be clusters (we - * never reallocate the first mbuf to avoid moving the - * packet header). - */ -bad: - return NULL; -} - /* * Move data from uio into mbufs. */
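
The hunks above, besides the MPSAFE conversion from crit_enter()/crit_exit() to a spinlock, introduce a jumbo-cluster allocator (m_getjcl()) and an append helper (m_append()). The short sketch below is not part of the patch; it only illustrates how a caller might combine those two interfaces. The helper name build_jumbo_packet() and its error handling are assumptions — only the m_getjcl(), m_append() and m_freem() calls and the MB_DONTWAIT, MT_DATA, M_PKTHDR and MJUMPAGESIZE constants come from the diff itself.

/*
 * Illustrative sketch only -- not part of the commit shown above.
 * Assumes the interfaces added by this diff:
 *   struct mbuf *m_getjcl(int how, short type, int flags, size_t size);
 *   int m_append(struct mbuf *m0, int len, c_caddr_t cp);
 * The function name build_jumbo_packet() is hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static struct mbuf *
build_jumbo_packet(const char *payload, int len)
{
	struct mbuf *m;

	/*
	 * Ask for a packet-header mbuf backed by an MJUMPAGESIZE jumbo
	 * cluster.  With MB_DONTWAIT the allocation may fail under memory
	 * pressure, so NULL must be handled by the caller.
	 */
	m = m_getjcl(MB_DONTWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
	if (m == NULL)
		return (NULL);

	/*
	 * m_append() copies into the trailing space of the chain and
	 * extends the chain with plain mbufs if the data does not fit;
	 * it returns 0 if an allocation fails partway through.
	 */
	if (m_append(m, len, payload) == 0) {
		m_freem(m);
		return (NULL);
	}
	return (m);
}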