1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-10-20 02:38:43 +00:00

netlink: use protocol specific receive buffer

Implement Netlink socket receive buffer as a simple TAILQ of nl_buf's,
same part of struct sockbuf that is used for send buffer already.
This shaves a lot of code and a lot of extra processing.  The pcb gets rid
of the I/O queues as the socket buffer is exactly the queue.  The
message writer is simplified a lot, as we now always deal with linear
buf.  The notion of different buffer types goes away, as well as different
kinds of writers.  The only things remaining are: a socket writer and
a group writer.
The impact on the network stack is that we no longer use mbufs, so
a workaround from d187154750 disappears.

Note on message throttling.  Now the taskqueue throttling mechanism
needs to look at both socket buffers protected by their respective
locks and on flags in the pcb that are protected by the pcb lock.
There is definitely some room for optimization, but this change tries
to preserve as much as possible.

Note on new nl_soreceive().  It emulates soreceive_generic().  It
must undergo further optimization, see large comment put in there.

Note on tests/sys/netlink/test_netlink_message_writer.py. This test
boiled down almost to nothing with mbufs removed.  However, I left
it with minimal functionality (it basically checks that when allocating N
bytes we get N bytes), as it is one of the few examples of the ktest
framework that allows testing KPIs with Python.

Note on Linux support.  It got much simpler: the Netlink message writer
loses notion of Linux support lifetime, it is same regardless of
process ABI.  On socket write from Linux process we perform
conversion immediately in nl_receive_message(), and on output the
conversion to Linux happens in nl_send_one().  XXX: both
conversions use M_NOWAIT allocation, which used to be the case
before this change, too.

Reviewed by:		melifaro
Differential Revision:	https://reviews.freebsd.org/D42524
This commit is contained in:
Gleb Smirnoff 2024-01-02 13:04:01 -08:00
parent 0ad011ecec
commit 17083b94a9
13 changed files with 416 additions and 1067 deletions

View File

@ -32,7 +32,6 @@
#include <sys/ck.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/vnode.h>
@ -44,6 +43,7 @@
#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_linux.h>
#include <netlink/netlink_var.h>
#include <netlink/netlink_route.h>
#include <compat/linux/linux.h>
@ -187,6 +187,7 @@ handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw)
if (out_hdr != NULL) {
memcpy(out_hdr, hdr, hdr->nlmsg_len);
nw->num_messages++;
return (true);
}
return (false);
@ -518,8 +519,7 @@ nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *
}
static bool
nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
struct nl_writer *nw)
nlmsg_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw)
{
if (hdr->nlmsg_type < NLMSG_MIN_TYPE) {
switch (hdr->nlmsg_type) {
@ -536,7 +536,7 @@ nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
}
}
switch (netlink_family) {
switch (nlp->nl_proto) {
case NETLINK_ROUTE:
return (rtnl_to_linux(hdr, nlp, nw));
default:
@ -544,64 +544,49 @@ nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
}
}
static struct mbuf *
nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp)
static bool
nlmsgs_to_linux(struct nl_writer *nw, struct nlpcb *nlp)
{
RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length);
struct nl_writer nw = {};
struct nl_buf *nb, *orig;
u_int offset, msglen, orig_messages __diagused;
struct mbuf *m = NULL;
if (!nlmsg_get_chain_writer(&nw, data_length, &m)) {
RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d",
data_length);
return (NULL);
}
RT_LOG(LOG_DEBUG3, "%p: in %u bytes %u messages", __func__,
nw->buf->datalen, nw->num_messages);
orig = nw->buf;
nb = nl_buf_alloc(orig->datalen + SCRATCH_BUFFER_SIZE, M_NOWAIT);
if (__predict_false(nb == NULL))
return (false);
nw->buf = nb;
#ifdef INVARIANTS
orig_messages = nw->num_messages;
#endif
nw->num_messages = 0;
/* Assume correct headers. Buffer IS mutable */
int count = 0;
for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset];
int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
count++;
for (offset = 0;
offset + sizeof(struct nlmsghdr) <= orig->datalen;
offset += msglen) {
struct nlmsghdr *hdr = (struct nlmsghdr *)&orig->data[offset];
if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) {
msglen = NLMSG_ALIGN(hdr->nlmsg_len);
if (!nlmsg_to_linux(hdr, nlp, nw)) {
RT_LOG(LOG_DEBUG, "failed to process msg type %d",
hdr->nlmsg_type);
m_freem(m);
return (NULL);
nl_buf_free(nb);
return (false);
}
offset += msglen;
}
nlmsg_flush(&nw);
RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count,
m ? m_length(m, NULL) : 0);
return (m);
}
MPASS(nw->num_messages == orig_messages);
MPASS(nw->buf == nb);
nl_buf_free(orig);
RT_LOG(LOG_DEBUG3, "%p: out %u bytes", __func__, offset);
static struct mbuf *
mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp)
{
/* XXX: easiest solution, not optimized for performance */
int data_length = m_length(m, NULL);
char *buf = malloc(data_length, M_LINUX, M_NOWAIT);
if (buf == NULL) {
RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message",
data_length);
m_freem(m);
return (NULL);
}
m_copydata(m, 0, data_length, buf);
m_freem(m);
m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp);
free(buf, M_LINUX);
return (m);
return (true);
}
static struct linux_netlink_provider linux_netlink_v1 = {
.mbufs_to_linux = mbufs_to_linux,
.msgs_to_linux = nlmsgs_to_linux,
.msg_from_linux = nlmsg_from_linux,
};

View File

@ -29,9 +29,9 @@
#include <sys/cdefs.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_var.h>
#include <netlink/netlink_message_writer.h>
#define KTEST_CALLER
@ -39,54 +39,47 @@
#ifdef INVARIANTS
struct test_mbuf_attrs {
struct test_nlbuf_attrs {
uint32_t size;
uint32_t expected_avail;
uint32_t expected_count;
uint32_t wtype;
int waitok;
};
#define _OUT(_field) offsetof(struct test_mbuf_attrs, _field)
static const struct nlattr_parser nla_p_mbuf_w[] = {
#define _OUT(_field) offsetof(struct test_nlbuf_attrs, _field)
static const struct nlattr_parser nla_p_nlbuf_w[] = {
{ .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 },
{ .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 },
{ .type = 3, .off = _OUT(expected_count), .cb = nlattr_get_uint32 },
{ .type = 4, .off = _OUT(wtype), .cb = nlattr_get_uint32 },
{ .type = 5, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
{ .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
};
#undef _OUT
NL_DECLARE_ATTR_PARSER(mbuf_w_parser, nla_p_mbuf_w);
NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w);
static int
test_mbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla)
test_nlbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla)
{
struct test_mbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs));
struct test_nlbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs));
ctx->arg = attrs;
if (attrs != NULL)
return (nl_parse_nested(nla, &mbuf_w_parser, ctx->npt, attrs));
return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, attrs));
return (ENOMEM);
}
static int
test_mbuf_writer_allocation(struct ktest_test_context *ctx)
test_nlbuf_writer_allocation(struct ktest_test_context *ctx)
{
struct test_mbuf_attrs *attrs = ctx->arg;
bool ret;
struct test_nlbuf_attrs *attrs = ctx->arg;
struct nl_writer nw = {};
u_int alloc_len;
bool ret;
ret = nlmsg_get_buf_type_wrapper(&nw, attrs->size, attrs->wtype, attrs->waitok);
ret = nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok);
if (!ret)
return (EINVAL);
int alloc_len = nw.alloc_len;
alloc_len = nw.buf->buflen;
KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len);
/* Set cleanup callback */
nw.writer_target = NS_WRITER_TARGET_SOCKET;
nlmsg_set_callback_wrapper(&nw);
/* Mark enomem to avoid reallocation */
nw.enomem = true;
@ -95,9 +88,7 @@ test_mbuf_writer_allocation(struct ktest_test_context *ctx)
return (EINVAL);
}
/* Mark as empty to free the storage */
nw.offset = 0;
nlmsg_flush(&nw);
nl_buf_free(nw.buf);
if (alloc_len < attrs->expected_avail) {
KTEST_LOG(ctx, "alloc_len %d, expected %u",
@ -107,60 +98,15 @@ test_mbuf_writer_allocation(struct ktest_test_context *ctx)
return (0);
}
static int
test_mbuf_chain_allocation(struct ktest_test_context *ctx)
{
struct test_mbuf_attrs *attrs = ctx->arg;
int mflags = attrs->waitok ? M_WAITOK : M_NOWAIT;
struct mbuf *chain = nl_get_mbuf_chain_wrapper(attrs->size, mflags);
if (chain == NULL) {
KTEST_LOG(ctx, "nl_get_mbuf_chain(%u) returned NULL", attrs->size);
return (EINVAL);
}
/* Iterate and check number of mbufs and space */
uint32_t allocated_count = 0, allocated_size = 0;
for (struct mbuf *m = chain; m != NULL; m = m->m_next) {
allocated_count++;
allocated_size += M_SIZE(m);
}
m_freem(chain);
if (attrs->expected_avail > allocated_size) {
KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
" expected/allocated count %u/%u",
attrs->expected_avail, allocated_size,
attrs->expected_count, allocated_count);
return (EINVAL);
}
if (attrs->expected_count > 0 && (attrs->expected_count != allocated_count)) {
KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
" expected/allocated count %u/%u",
attrs->expected_avail, allocated_size,
attrs->expected_count, allocated_count);
return (EINVAL);
}
return (0);
}
#endif
static const struct ktest_test_info tests[] = {
#ifdef INVARIANTS
{
.name = "test_mbuf_writer_allocation",
.desc = "test different mbuf sizes in the mbuf writer",
.func = &test_mbuf_writer_allocation,
.parse = &test_mbuf_parser,
},
{
.name = "test_mbuf_chain_allocation",
.desc = "verify allocation different chain sizes",
.func = &test_mbuf_chain_allocation,
.parse = &test_mbuf_parser,
.name = "test_nlbuf_writer_allocation",
.desc = "test different buffer sizes in the netlink writer",
.func = &test_nlbuf_writer_allocation,
.parse = &test_nlbuf_parser,
},
#endif
};

View File

@ -30,28 +30,14 @@
#if defined(_KERNEL) && defined(INVARIANTS)
bool nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok);
void nlmsg_set_callback_wrapper(struct nl_writer *nw);
struct mbuf *nl_get_mbuf_chain_wrapper(int len, int malloc_flags);
bool nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok);
#ifndef KTEST_CALLER
bool
nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok)
nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok)
{
return (nlmsg_get_buf_type(nw, size, type, waitok));
}
void
nlmsg_set_callback_wrapper(struct nl_writer *nw)
{
nlmsg_set_callback(nw);
}
struct mbuf *
nl_get_mbuf_chain_wrapper(int len, int malloc_flags)
{
return (nl_get_mbuf_chain(len, malloc_flags));
return (nlmsg_get_buf(nw, size, waitok));
}
#endif

View File

@ -179,53 +179,76 @@ nl_get_groups_compat(struct nlpcb *nlp)
}
static void
nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages,
int io_flags)
nl_send_one_group(struct nl_writer *nw, struct nl_buf *nb, struct nlpcb *nlp)
{
if (__predict_false(nlp->nl_flags & NLF_MSG_INFO))
nl_add_msg_info(m);
nl_send_one(m, nlp, num_messages, io_flags);
nl_add_msg_info(nb);
nw->buf = nb;
(void)nl_send_one(nw);
}
/*
 * Return a deep copy of netlink buffer @nb, or NULL if any allocation
 * fails.  Both the buffer itself and the control-message mbuf chain are
 * duplicated with M_NOWAIT, so this is safe in non-sleepable contexts.
 * The caller owns the returned buffer and presumably releases it with
 * nl_buf_free() (callers here hand it to nl_send_one_group()).
 */
static struct nl_buf *
nl_buf_copy(struct nl_buf *nb)
{
struct nl_buf *copy;
copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
if (__predict_false(copy == NULL))
return (NULL);
/*
 * Copy the header and the whole data area in one shot.  This also
 * copies the 'control' pointer, which is immediately replaced below
 * when non-NULL, so the copy never aliases the original's chain.
 */
memcpy(copy, nb, sizeof(*nb) + nb->buflen);
if (nb->control != NULL) {
/* Duplicate the control mbufs so the copy owns its own chain. */
copy->control = m_copym(nb->control, 0, M_COPYALL, M_NOWAIT);
if (__predict_false(copy->control == NULL)) {
/*
 * m_copym() failed and left copy->control NULL, so
 * nl_buf_free() will not free the original's control
 * chain here.
 */
nl_buf_free(copy);
return (NULL);
}
}
return (copy);
}
/*
* Broadcasts message @m to the protocol @proto group specified by @group_id
* Broadcasts in the writer's buffer.
*/
void
nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
bool
nl_send_group(struct nl_writer *nw)
{
struct nl_buf *nb = nw->buf;
struct nlpcb *nlp_last = NULL;
struct nlpcb *nlp;
NLCTL_TRACKER;
IF_DEBUG_LEVEL(LOG_DEBUG2) {
struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
nw->group.proto, nw->group.id);
}
nw->buf = NULL;
struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
if (__predict_false(ctl == NULL)) {
/*
* Can be the case when notification is sent within VNET
* which doesn't have any netlink sockets.
*/
m_freem(m);
return;
nl_buf_free(nb);
return (false);
}
NLCTL_RLOCK(ctl);
int io_flags = NL_IOF_UNTRANSLATED;
CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) {
if (nl_isset_group_locked(nlp, nw->group.id) &&
nlp->nl_proto == nw->group.proto) {
if (nlp_last != NULL) {
struct mbuf *m_copy;
m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
if (m_copy != NULL)
nl_send_one_group(m_copy, nlp_last,
num_messages, io_flags);
else {
struct nl_buf *copy;
copy = nl_buf_copy(nb);
if (copy != NULL) {
nl_send_one_group(nw, copy, nlp_last);
} else {
NLP_LOCK(nlp_last);
if (nlp_last->nl_socket != NULL)
sorwakeup(nlp_last->nl_socket);
@ -236,11 +259,13 @@ nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
}
}
if (nlp_last != NULL)
nl_send_one_group(m, nlp_last, num_messages, io_flags);
nl_send_one_group(nw, nb, nlp_last);
else
m_freem(m);
nl_buf_free(nb);
NLCTL_RUNLOCK(ctl);
return (true);
}
bool
@ -331,7 +356,7 @@ nl_pru_attach(struct socket *so, int proto, struct thread *td)
free(nlp, M_PCB);
return (error);
}
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
TAILQ_INIT(&so->so_rcv.nl_queue);
TAILQ_INIT(&so->so_snd.nl_queue);
so->so_pcb = nlp;
nlp->nl_socket = so;
@ -344,7 +369,6 @@ nl_pru_attach(struct socket *so, int proto, struct thread *td)
nlp->nl_need_thread_setup = true;
NLP_LOCK_INIT(nlp);
refcount_init(&nlp->nl_refcount, 1);
nl_init_io(nlp);
nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
taskqueue_thread_enqueue, &nlp->nl_taskqueue);
@ -467,15 +491,6 @@ nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
return (0);
}
static void
destroy_nlpcb(struct nlpcb *nlp)
{
NLP_LOCK(nlp);
nl_free_io(nlp);
NLP_LOCK_DESTROY(nlp);
free(nlp, M_PCB);
}
static void
destroy_nlpcb_epoch(epoch_context_t ctx)
{
@ -483,10 +498,10 @@ destroy_nlpcb_epoch(epoch_context_t ctx)
nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
destroy_nlpcb(nlp);
NLP_LOCK_DESTROY(nlp);
free(nlp, M_PCB);
}
static void
nl_close(struct socket *so)
{
@ -522,9 +537,12 @@ nl_close(struct socket *so)
while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
free(nb, M_NETLINK);
nl_buf_free(nb);
}
while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
nl_buf_free(nb);
}
sbdestroy(so, SO_RCV);
NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
@ -597,10 +615,8 @@ nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
if (nlp->nl_linux)
len += roundup2(uio->uio_resid, 8);
nb = malloc(sizeof(*nb) + len, M_NETLINK, M_WAITOK);
nb = nl_buf_alloc(len, M_WAITOK);
nb->datalen = uio->uio_resid;
nb->buflen = len;
nb->offset = 0;
error = uiomove(&nb->data[0], uio->uio_resid, uio);
if (__predict_false(error))
goto out;
@ -635,19 +651,107 @@ nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
out:
SOCK_IO_SEND_UNLOCK(so);
free(nb, M_NETLINK);
if (nb != NULL)
nl_buf_free(nb);
return (error);
}
static int
nl_pru_rcvd(struct socket *so, int flags)
nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
static const struct sockaddr_nl nl_empty_src = {
.nl_len = sizeof(struct sockaddr_nl),
.nl_family = PF_NETLINK,
.nl_pid = 0 /* comes from the kernel */
};
struct sockbuf *sb = &so->so_rcv;
struct nl_buf *nb;
int flags, error;
u_int overflow;
bool nonblock, trunc, peek;
MPASS(mp == NULL && uio != NULL);
NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
MPASS(sotonlpcb(so) != NULL);
if (psa != NULL)
*psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
M_WAITOK);
flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
nonblock = (so->so_state & SS_NBIO) ||
(flags & (MSG_DONTWAIT | MSG_NBIO));
peek = flags & MSG_PEEK;
error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
if (__predict_false(error))
return (error);
SOCK_RECVBUF_LOCK(so);
while ((nb = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
if (nonblock) {
SOCK_RECVBUF_UNLOCK(so);
SOCK_IO_RECV_UNLOCK(so);
return (EWOULDBLOCK);
}
error = sbwait(so, SO_RCV);
if (error) {
SOCK_RECVBUF_UNLOCK(so);
SOCK_IO_RECV_UNLOCK(so);
return (error);
}
}
/*
* XXXGL
* Here we emulate a PR_ATOMIC behavior of soreceive_generic() where
* we take only the first "record" in the socket buffer and send it
* to uio whole or truncated ignoring how many netlink messages are
* in the record and how much space is left in the uio.
* This needs to be fixed at next refactoring. First, we should perform
* truncation only if the very first message doesn't fit into uio.
* That will help an application with small buffer not to lose data.
* Second, we should continue working on the sb->nl_queue as long as
* there is more space in the uio. That will boost applications with
* large buffers.
*/
if (__predict_true(!peek)) {
TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
sb->sb_acc -= nb->datalen;
sb->sb_ccc -= nb->datalen;
}
SOCK_RECVBUF_UNLOCK(so);
overflow = __predict_false(nb->datalen > uio->uio_resid) ?
nb->datalen - uio->uio_resid : 0;
error = uiomove(nb->data, (int)nb->datalen, uio);
if (__predict_false(overflow > 0)) {
flags |= MSG_TRUNC;
if (trunc)
uio->uio_resid -= overflow;
}
if (controlp != NULL) {
*controlp = nb->control;
nb->control = NULL;
}
if (__predict_true(!peek))
nl_buf_free(nb);
if (uio->uio_td)
uio->uio_td->td_ru.ru_msgrcv++;
if (flagsp != NULL)
*flagsp |= flags;
SOCK_IO_RECV_UNLOCK(so);
nl_on_transmit(sotonlpcb(so));
return (0);
return (error);
}
static int
@ -798,8 +902,7 @@ nl_setsbopt(struct socket *so, struct sockopt *sopt)
}
#define NETLINK_PROTOSW \
.pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD | \
PR_SOCKBUF, \
.pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \
.pr_ctloutput = nl_ctloutput, \
.pr_setsbopt = nl_setsbopt, \
.pr_attach = nl_pru_attach, \
@ -807,7 +910,7 @@ nl_setsbopt(struct socket *so, struct sockopt *sopt)
.pr_connect = nl_pru_connect, \
.pr_disconnect = nl_pru_disconnect, \
.pr_sosend = nl_sosend, \
.pr_rcvd = nl_pru_rcvd, \
.pr_soreceive = nl_soreceive, \
.pr_shutdown = nl_pru_shutdown, \
.pr_sockaddr = nl_sockaddr, \
.pr_close = nl_close

View File

@ -111,7 +111,6 @@ static bool
get_stub_writer(struct nl_writer *nw)
{
bzero(nw, sizeof(*nw));
nw->writer_type = NS_WRITER_TYPE_STUB;
nw->enomem = true;
return (false);

View File

@ -51,69 +51,36 @@ _DECLARE_DEBUG(LOG_INFO);
* sending netlink data between the kernel and userland.
*/
static const struct sockaddr_nl _nl_empty_src = {
.nl_len = sizeof(struct sockaddr_nl),
.nl_family = PF_NETLINK,
.nl_pid = 0 /* comes from the kernel */
};
static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
static void
queue_push(struct nl_io_queue *q, struct mbuf *mq)
struct nl_buf *
nl_buf_alloc(size_t len, int mflag)
{
while (mq != NULL) {
struct mbuf *m = mq;
mq = mq->m_nextpkt;
m->m_nextpkt = NULL;
struct nl_buf *nb;
q->length += m_length(m, NULL);
STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
if (__predict_true(nb != NULL)) {
nb->buflen = len;
nb->datalen = nb->offset = 0;
nb->control = NULL;
}
}
static struct mbuf *
queue_pop(struct nl_io_queue *q)
{
if (!STAILQ_EMPTY(&q->head)) {
struct mbuf *m = STAILQ_FIRST(&q->head);
STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
m->m_nextpkt = NULL;
q->length -= m_length(m, NULL);
return (m);
}
return (NULL);
}
static struct mbuf *
queue_head(const struct nl_io_queue *q)
{
return (STAILQ_FIRST(&q->head));
}
static inline bool
queue_empty(const struct nl_io_queue *q)
{
return (q->length == 0);
}
static void
queue_free(struct nl_io_queue *q)
{
while (!STAILQ_EMPTY(&q->head)) {
struct mbuf *m = STAILQ_FIRST(&q->head);
STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
m->m_nextpkt = NULL;
m_freem(m);
}
q->length = 0;
return (nb);
}
void
nl_add_msg_info(struct mbuf *m)
nl_buf_free(struct nl_buf *nb)
{
if (nb->control)
m_freem(nb->control);
free(nb, M_NETLINK);
}
void
nl_add_msg_info(struct nl_buf *nb)
{
/* XXXGL pass nlp as arg? */
struct nlpcb *nlp = nl_get_thread_nlp(curthread);
NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
curthread, nlp);
@ -139,27 +106,15 @@ nl_add_msg_info(struct mbuf *m)
};
while (m->m_next != NULL)
m = m->m_next;
m->m_next = sbcreatecontrol(data, sizeof(data),
nb->control = sbcreatecontrol(data, sizeof(data),
NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);
NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
(unsigned)sizeof(data), m->m_next);
}
static __noinline struct mbuf *
extract_msg_info(struct mbuf *m)
{
while (m->m_next != NULL) {
if (m->m_next->m_type == MT_CONTROL) {
struct mbuf *ctl = m->m_next;
m->m_next = NULL;
return (ctl);
}
m = m->m_next;
}
return (NULL);
if (__predict_true(nb->control != NULL))
NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p",
(unsigned)sizeof(data), nb->control);
else
NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control",
(unsigned)sizeof(data));
}
void
@ -174,65 +129,31 @@ nl_schedule_taskqueue(struct nlpcb *nlp)
}
}
static bool
tx_check_locked(struct nlpcb *nlp)
{
if (queue_empty(&nlp->tx_queue))
return (true);
/*
* Check if something can be moved from the internal TX queue
* to the socket queue.
*/
bool appended = false;
struct sockbuf *sb = &nlp->nl_socket->so_rcv;
SOCKBUF_LOCK(sb);
while (true) {
struct mbuf *m = queue_head(&nlp->tx_queue);
if (m != NULL) {
struct mbuf *ctl = NULL;
if (__predict_false(m->m_next != NULL))
ctl = extract_msg_info(m);
if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
/* appended successfully */
queue_pop(&nlp->tx_queue);
appended = true;
} else
break;
} else
break;
}
SOCKBUF_UNLOCK(sb);
if (appended)
sorwakeup(nlp->nl_socket);
return (queue_empty(&nlp->tx_queue));
}
static bool
nl_process_received_one(struct nlpcb *nlp)
{
struct socket *so = nlp->nl_socket;
struct sockbuf *sb = &so->so_snd;
struct sockbuf *sb;
struct nl_buf *nb;
bool reschedule = false;
NLP_LOCK(nlp);
nlp->nl_task_pending = false;
if (!tx_check_locked(nlp)) {
/* TX overflow queue still not empty, ignore RX */
NLP_UNLOCK(nlp);
return (false);
}
int prev_hiwat = nlp->tx_queue.hiwat;
NLP_UNLOCK(nlp);
/*
* Do not process queued up requests if there is no space to queue
* replies.
*/
sb = &so->so_rcv;
SOCK_RECVBUF_LOCK(so);
if (sb->sb_hiwat <= sb->sb_ccc) {
SOCK_RECVBUF_UNLOCK(so);
return (false);
}
SOCK_RECVBUF_UNLOCK(so);
sb = &so->so_snd;
SOCK_SENDBUF_LOCK(so);
while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
@ -244,7 +165,7 @@ nl_process_received_one(struct nlpcb *nlp)
sb->sb_ccc -= nb->datalen;
/* XXXGL: potentially can reduce lock&unlock count. */
sowwakeup_locked(so);
free(nb, M_NETLINK);
nl_buf_free(nb);
SOCK_SENDBUF_LOCK(so);
} else {
TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
@ -252,10 +173,6 @@ nl_process_received_one(struct nlpcb *nlp)
}
}
SOCK_SENDBUF_UNLOCK(so);
if (nlp->tx_queue.hiwat > prev_hiwat) {
NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
}
return (reschedule);
}
@ -276,18 +193,6 @@ nl_process_received(struct nlpcb *nlp)
;
}
void
nl_init_io(struct nlpcb *nlp)
{
STAILQ_INIT(&nlp->tx_queue.head);
}
void
nl_free_io(struct nlpcb *nlp)
{
queue_free(&nlp->tx_queue);
}
/*
* Called after some data have been read from the socket.
*/
@ -306,8 +211,8 @@ nl_on_transmit(struct nlpcb *nlp)
struct sockbuf *sb = &so->so_rcv;
NLP_LOG(LOG_DEBUG, nlp,
"socket RX overflowed, %lu messages (%lu bytes) dropped. "
"bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
"bytes: [%u/%u]", dropped_messages, dropped_bytes,
sb->sb_ccc, sb->sb_hiwat);
/* TODO: send netlink message */
}
@ -325,95 +230,67 @@ nl_taskqueue_handler(void *_arg, int pending)
CURVNET_RESTORE();
}
static __noinline void
queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
{
queue_push(&nlp->tx_queue, m);
nlp->nl_tx_blocked = true;
if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
nlp->tx_queue.hiwat = nlp->tx_queue.length;
}
/*
* Tries to send @m to the socket @nlp.
*
* @m: mbuf(s) to send to. Consumed in any case.
* @nlp: socket to send to
* @cnt: number of messages in @m
* @io_flags: combination of NL_IOF_* flags
* Tries to send current data buffer from writer.
*
* Returns true on success.
* If no queue overrunes happened, wakes up socket owner.
*/
bool
nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
nl_send_one(struct nl_writer *nw)
{
bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
bool result = true;
struct nlpcb *nlp = nw->nlp;
struct socket *so = nlp->nl_socket;
struct sockbuf *sb = &so->so_rcv;
struct nl_buf *nb;
MPASS(nw->hdr == NULL);
IF_DEBUG_LEVEL(LOG_DEBUG2) {
struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
NLP_LOG(LOG_DEBUG2, nlp,
"TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
io_flags);
"TX len %u msgs %u msg type %d first hdrlen %u",
nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
hdr->nlmsg_len);
}
if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
if (m == NULL)
return (false);
}
NLP_LOCK(nlp);
if (__predict_false(nlp->nl_socket == NULL)) {
NLP_UNLOCK(nlp);
m_freem(m);
if (nlp->nl_linux && linux_netlink_p != NULL &&
__predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
nl_buf_free(nw->buf);
nw->buf = NULL;
return (false);
}
if (!queue_empty(&nlp->tx_queue)) {
if (ignore_limits) {
queue_push_tx(nlp, m);
} else {
m_free(m);
result = false;
}
nb = nw->buf;
nw->buf = NULL;
SOCK_RECVBUF_LOCK(so);
if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
SOCK_RECVBUF_UNLOCK(so);
NLP_LOCK(nlp);
nlp->nl_dropped_bytes += nb->datalen;
nlp->nl_dropped_messages += nw->num_messages;
NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
(unsigned long)nlp->nl_dropped_messages, nw->num_messages,
(unsigned long)nlp->nl_dropped_bytes, nb->datalen);
NLP_UNLOCK(nlp);
return (result);
}
struct socket *so = nlp->nl_socket;
struct mbuf *ctl = NULL;
if (__predict_false(m->m_next != NULL))
ctl = extract_msg_info(m);
if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) {
sorwakeup(so);
NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
nl_buf_free(nb);
return (false);
} else {
if (ignore_limits) {
queue_push_tx(nlp, m);
} else {
/*
* Store dropped data so it can be reported
* on the next read
*/
nlp->nl_dropped_bytes += m_length(m, NULL);
nlp->nl_dropped_messages += num_messages;
NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
(unsigned long)nlp->nl_dropped_messages, num_messages,
(unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
soroverflow(so);
m_freem(m);
result = false;
}
}
NLP_UNLOCK(nlp);
bool full;
return (result);
TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
sb->sb_acc += nb->datalen;
sb->sb_ccc += nb->datalen;
full = sb->sb_hiwat <= sb->sb_ccc;
sorwakeup_locked(so);
if (full) {
NLP_LOCK(nlp);
nlp->nl_tx_blocked = true;
NLP_UNLOCK(nlp);
}
return (true);
}
}
static int

View File

@ -27,6 +27,7 @@
#ifndef _NETLINK_LINUX_VAR_H_
#define _NETLINK_LINUX_VAR_H_
#ifdef _KERNEL
/*
* The file contains headers for the bridge interface between
@ -34,16 +35,13 @@
*/
struct nlpcb;
struct nl_pstate;
struct nl_writer;
typedef struct mbuf *mbufs_to_linux_cb_t(int netlink_family, struct mbuf *m,
struct nlpcb *nlp);
typedef struct mbuf *msgs_to_linux_cb_t(int netlink_family, char *buf, int data_length,
struct nlpcb *nlp);
typedef bool msgs_to_linux_cb_t(struct nl_writer *nw, struct nlpcb *nlp);
typedef struct nlmsghdr *msg_from_linux_cb_t(int netlink_family, struct nlmsghdr *hdr,
struct nl_pstate *npt);
struct linux_netlink_provider {
mbufs_to_linux_cb_t *mbufs_to_linux;
msgs_to_linux_cb_t *msgs_to_linux;
msg_from_linux_cb_t *msg_from_linux;
@ -52,3 +50,4 @@ struct linux_netlink_provider {
extern struct linux_netlink_provider *linux_netlink_p;
#endif
#endif

View File

@ -30,7 +30,6 @@
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/mbuf.h>
#include <sys/ck.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
@ -45,523 +44,44 @@
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);
/*
* The goal of this file is to provide convenient message writing KPI on top of
* different storage methods (mbufs, uio, temporary memory chunks).
*
* The main KPI guarantee is that the (last) message always resides in the contiguous
* memory buffer, so one is able to update the header after writing the entire message.
*
* This guarantee comes with a side effect of potentially reallocating underlying
* buffer, so one needs to update the desired pointers after something is added
* to the header.
*
* Messaging layer contains hooks performing transparent Linux translation for the messages.
*
* There are 3 types of supported targets:
* * socket (adds mbufs to the socket buffer, used for message replies)
* * group (sends mbuf/chain to the specified groups, used for the notifications)
* * chain (returns mbuf chain, used in Linux message translation code)
*
* There are 3 types of storage:
* * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
* fits in NLMBUFSIZE)
* * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
* to be larger than one supported by NS_WRITER_TYPE_MBUF)
* * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
* Linux sockets, calls translation hook prior to sending messages to the socket).
*
* Internally, KPI switches between different types of storage when memory requirements
* change. It happens transparently to the caller.
*/
/*
* Uma zone for the mbuf-based Netlink storage
*/
static uma_zone_t nlmsg_zone;
static void
nl_free_mbuf_storage(struct mbuf *m)
{
uma_zfree(nlmsg_zone, m->m_ext.ext_buf);
}
static int
nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused)
{
struct mbuf *m = (struct mbuf *)arg;
if (m != NULL)
m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE);
return (0);
}
static struct mbuf *
nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags)
{
struct mbuf *m, *m_storage;
if (size <= MHLEN)
return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags));
if (__predict_false(size > NLMBUFSIZE))
return (NULL);
m = m_gethdr(malloc_flags, MT_DATA);
if (m == NULL)
return (NULL);
m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags);
if (m_storage == NULL) {
m_free_raw(m);
return (NULL);
}
return (m);
}
static struct mbuf *
nl_get_mbuf(int size, int malloc_flags)
{
return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR));
}
/*
 * Gets a chain of Netlink mbufs totalling at least @len bytes of space,
 * each link holding up to NLMBUFSIZE bytes.
 * This is a stripped-down version of m_getm2().
 * Returns NULL (freeing any partially built chain) on allocation failure.
 */
static struct mbuf *
nl_get_mbuf_chain(int len, int malloc_flags)
{
	struct mbuf *m_chain = NULL, *m_tail = NULL;
	int mbuf_flags = M_PKTHDR;

	while (len > 0) {
		int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len;
		struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags);

		if (m == NULL) {
			m_freem(m_chain);
			return (NULL);
		}

		/* Book keeping. */
		/*
		 * NOTE(review): the decrement uses M_SIZE(m) (the mbuf's
		 * full storage size) rather than the requested sz — this
		 * mirrors m_getm2(); confirm it is intended to over-count
		 * available space for small trailing segments.
		 */
		len -= M_SIZE(m);
		if (m_tail != NULL)
			m_tail->m_next = m;
		else
			m_chain = m;
		m_tail = m;
		mbuf_flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */
	}
	return (m_chain);
}
/*
 * Creates the Netlink message storage zone, with
 * nl_setup_mbuf_storage() as the per-item constructor.
 * Called once at module load time.
 */
void
nl_init_msg_zone(void)
{
	nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
}
/*
 * Tears down the Netlink message storage zone.
 * Called once at module unload time.
 */
void
nl_destroy_msg_zone(void)
{
	uma_zdestroy(nlmsg_zone);
}
/* Storage constructor: set up @nw to hold @size bytes of messages. */
typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
/* Flush callback: consume @buf (@buflen bytes, @cnt messages); true on success. */
typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);

/*
 * Per-storage-type operation vector; instances live in nlmsg_writers[],
 * indexed by the NS_WRITER_TYPE_* constants.
 */
struct nlwriter_ops {
	nlwriter_op_init *init;
	nlwriter_op_write *write_socket;
	nlwriter_op_write *write_group;
	nlwriter_op_write *write_chain;
};
/*
* NS_WRITER_TYPE_BUF
* Writes message to a temporary memory buffer,
* flushing to the socket/group when buffer size limit is reached
*/
static bool
nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
nlmsg_get_buf(struct nl_writer *nw, u_int len, bool waitok)
{
int mflag = waitok ? M_WAITOK : M_NOWAIT;
nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
if (__predict_false(nw->_storage == NULL))
const int mflag = waitok ? M_WAITOK : M_NOWAIT;
MPASS(nw->buf == NULL);
NL_LOG(LOG_DEBUG3, "Setting up nw %p len %u %s", nw, len,
waitok ? "wait" : "nowait");
nw->buf = nl_buf_alloc(len, mflag);
if (__predict_false(nw->buf == NULL))
return (false);
nw->alloc_len = size;
nw->offset = 0;
nw->hdr = NULL;
nw->data = nw->_storage;
nw->writer_type = NS_WRITER_TYPE_BUF;
nw->malloc_flag = mflag;
nw->num_messages = 0;
nw->enomem = false;
return (true);
}
/*
 * Flush callback: NS_WRITER_TYPE_BUF -> socket.
 * Copies the malloc'ed buffer @buf (@datalen bytes, @cnt messages) into
 * a newly allocated mbuf chain and passes it to nl_send_one() for the
 * pcb stored in nw->arg.ptr.  Consumes @buf on every path.
 * Returns true on success (an empty buffer counts as success).
 */
static bool
nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);

	if (__predict_false(m == NULL)) {
		/* XXX: should we set sorcverr? */
		free(buf, M_NETLINK);
		return (false);
	}
	m_append(m, datalen, buf);
	free(buf, M_NETLINK);

	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;

	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
}
/*
 * Flush callback: NS_WRITER_TYPE_BUF -> group.
 * Copies the malloc'ed buffer @buf into an mbuf chain and multicasts it
 * to the group described by nw->arg.group via nl_send_group().
 * Consumes @buf on every path.  Returns true on success.
 */
static bool
nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
	    nw->arg.group.proto, nw->arg.group.id);
	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);

	if (__predict_false(m == NULL)) {
		free(buf, M_NETLINK);
		return (false);
	}

	/* m_append() returns 0 when it could not fit all of the data. */
	bool success = m_append(m, datalen, buf) != 0;

	free(buf, M_NETLINK);
	if (!success)
		return (false);
	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	return (true);
}
/*
 * Flush callback: NS_WRITER_TYPE_BUF -> mbuf chain.
 * Appends @datalen bytes from the malloc'ed @buf to the caller-owned
 * chain anchored at *nw->arg.ptr, allocating the chain on first use.
 * Consumes @buf on every path.  Returns true on success.
 */
static bool
nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);

	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	if (*m0 == NULL) {
		/* First flush: start the chain. */
		struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);

		if (__predict_false(m == NULL)) {
			free(buf, M_NETLINK);
			return (false);
		}
		*m0 = m;
	}

	if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
		free(buf, M_NETLINK);
		return (false);
	}
	free(buf, M_NETLINK);
	return (true);
}
/*
 * NS_WRITER_TYPE_MBUF
 * Writes message to the allocated mbuf,
 * flushing to socket/group when mbuf size limit is reached.
 * This is the most efficient mechanism as it avoids double-copying.
 *
 * Allocates a single mbuf suitable to store up to @size bytes of data.
 * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr.
 * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone.
 * Returns NULL on greater size or the allocation failure.
 */
static bool
nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
{
	int mflag = waitok ? M_WAITOK : M_NOWAIT;
	struct mbuf *m = nl_get_mbuf(size, mflag);

	if (__predict_false(m == NULL))
		return (false);
	/* The mbuf itself doubles as the writer's backing storage. */
	nw->alloc_len = M_TRAILINGSPACE(m);
	nw->offset = 0;
	nw->hdr = NULL;
	nw->_storage = (void *)m;
	nw->data = mtod(m, void *);
	nw->writer_type = NS_WRITER_TYPE_MBUF;
	nw->malloc_flag = mflag;
	nw->num_messages = 0;
	nw->enomem = false;
	memset(nw->data, 0, size);
	NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
	    m, size, nw->alloc_len, nw->data);
	return (true);
}
/*
 * Flush callback: NS_WRITER_TYPE_MBUF -> socket.
 * The data already resides in the mbuf passed as @buf, so no copy is
 * needed: just record the final length and hand the mbuf (ownership
 * included) to nl_send_one().  Returns true on success.
 */
static bool
nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct mbuf *m = (struct mbuf *)buf;

	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	if (__predict_false(datalen == 0)) {
		m_freem(m);
		return (true);
	}

	m->m_pkthdr.len = datalen;
	m->m_len = datalen;

	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;

	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
}
/*
 * Flush callback: NS_WRITER_TYPE_MBUF -> group.
 * Records the final length of the mbuf passed as @buf and multicasts it
 * to the group described by nw->arg.group.  Always succeeds unless the
 * buffer is empty (which is also treated as success after freeing).
 */
static bool
nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct mbuf *m = (struct mbuf *)buf;

	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
	    nw->arg.group.proto, nw->arg.group.id);
	if (__predict_false(datalen == 0)) {
		m_freem(m);
		return (true);
	}

	m->m_pkthdr.len = datalen;
	m->m_len = datalen;
	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	return (true);
}
/*
 * Flush callback: NS_WRITER_TYPE_MBUF -> mbuf chain.
 * Links the filled mbuf @buf to the tail of the caller-owned chain
 * anchored at *nw->arg.ptr (or makes it the head), keeping the head's
 * packet-header length up to date.  Returns true on success.
 */
static bool
nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct mbuf *m_new = (struct mbuf *)buf;
	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);

	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	if (__predict_false(datalen == 0)) {
		m_freem(m_new);
		return (true);
	}

	m_new->m_pkthdr.len = datalen;
	m_new->m_len = datalen;

	if (*m0 == NULL) {
		*m0 = m_new;
	} else {
		struct mbuf *m_last;

		/* Walk to the tail; chains here are expected to be short. */
		for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
			;
		m_last->m_next = m_new;
		/* Total packet length lives in the head mbuf only. */
		(*m0)->m_pkthdr.len += datalen;
	}
	return (true);
}
/*
 * NS_WRITER_TYPE_LBUF
 * Writes message to the allocated memory buffer,
 * flushing to socket/group when mbuf size limit is reached.
 * Calls linux handler to rewrite messages before sending to the socket.
 */
static bool
nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
{
	int mflag = waitok ? M_WAITOK : M_NOWAIT;

	size = roundup2(size, sizeof(void *));

	/*
	 * One allocation laid out as:
	 *   [struct linear_buffer][size bytes message data][size +
	 *   SCRATCH_BUFFER_SIZE bytes translation area (lb->base)]
	 * The second area is used by the Linux translation hook at flush
	 * time.
	 */
	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
	char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);

	if (__predict_false(buf == NULL))
		return (false);

	/* Fill buffer header first */
	struct linear_buffer *lb = (struct linear_buffer *)buf;

	lb->base = &buf[sizeof(struct linear_buffer) + size];
	lb->size = size + SCRATCH_BUFFER_SIZE;

	nw->alloc_len = size;
	nw->offset = 0;
	nw->hdr = NULL;
	nw->_storage = buf;
	nw->data = (char *)(lb + 1);
	nw->malloc_flag = mflag;
	nw->writer_type = NS_WRITER_TYPE_LBUF;
	nw->num_messages = 0;
	nw->enomem = false;
	return (true);
}
/*
 * Flush callback: NS_WRITER_TYPE_LBUF -> socket.
 * Runs the accumulated messages through the Linux translation hook
 * (msgs_to_linux), which produces an mbuf in the Linux wire format,
 * then sends it via nl_send_one().  Consumes @buf on every path.
 * Fails when the Linux module is not loaded or translation fails.
 */
static bool
nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct linear_buffer *lb = (struct linear_buffer *)buf;
	char *data = (char *)(lb + 1);
	struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);

	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = NULL;

	if (linux_netlink_p != NULL)
		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
	free(buf, M_NETLINK);

	if (__predict_false(m == NULL)) {
		/* XXX: should we set sorcverr? */
		return (false);
	}

	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;

	return (nl_send_one(m, nlp, cnt, io_flags));
}
/*
 * Flush callback: NS_WRITER_TYPE_LBUF -> group.
 * Unlike the socket variant, no Linux translation happens here: the raw
 * data is copied into an mbuf chain and multicast via nl_send_group().
 * Shouldn't be called (maybe except Linux code originating message).
 */
static bool
nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct linear_buffer *lb = (struct linear_buffer *)buf;
	char *data = (char *)(lb + 1);

	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);

	if (__predict_false(m == NULL)) {
		free(buf, M_NETLINK);
		return (false);
	}
	m_append(m, datalen, data);
	free(buf, M_NETLINK);
	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	return (true);
}
/*
 * Dispatch table for the writer storage types; the order of entries
 * must match the NS_WRITER_TYPE_* constant values used to index it.
 */
static const struct nlwriter_ops nlmsg_writers[] = {
	/* NS_WRITER_TYPE_MBUF */
	{
		.init = nlmsg_get_ns_mbuf,
		.write_socket = nlmsg_write_socket_mbuf,
		.write_group = nlmsg_write_group_mbuf,
		.write_chain = nlmsg_write_chain_mbuf,
	},
	/* NS_WRITER_TYPE_BUF */
	{
		.init = nlmsg_get_ns_buf,
		.write_socket = nlmsg_write_socket_buf,
		.write_group = nlmsg_write_group_buf,
		.write_chain = nlmsg_write_chain_buf,
	},
	/* NS_WRITER_TYPE_LBUF */
	{
		.init = nlmsg_get_ns_lbuf,
		.write_socket = nlmsg_write_socket_lbuf,
		.write_group = nlmsg_write_group_lbuf,
		/*
		 * NOTE(review): .write_chain is intentionally absent (NULL).
		 * nlmsg_set_callback() would install NULL for a chain target
		 * with LBUF storage — presumably callers never request that
		 * combination; confirm.
		 */
	},
};
/*
 * Installs the flush callback matching the writer's storage type
 * (nw->writer_type indexes nlmsg_writers[]) and destination
 * (nw->writer_target).  Panics on an unknown target.
 */
static void
nlmsg_set_callback(struct nl_writer *nw)
{
	const struct nlwriter_ops *ops;

	ops = &nlmsg_writers[nw->writer_type];
	if (nw->writer_target == NS_WRITER_TARGET_SOCKET)
		nw->cb = ops->write_socket;
	else if (nw->writer_target == NS_WRITER_TARGET_GROUP)
		nw->cb = ops->write_group;
	else if (nw->writer_target == NS_WRITER_TARGET_CHAIN)
		nw->cb = ops->write_chain;
	else
		panic("not implemented");
}
/*
 * Initializes writer @nw with @size bytes of storage of the requested
 * NS_WRITER_TYPE_* @type by dispatching to that type's init routine.
 * Returns false when the underlying allocation fails.
 */
static bool
nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
{
	const struct nlwriter_ops *ops;

	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
	ops = &nlmsg_writers[type];
	return (ops->init(nw, size, waitok));
}
/*
 * Picks the storage type for a new writer and initializes it:
 * Linux sockets always use the translating LBUF storage; native
 * sockets use the mbuf-backed storage when the request fits in
 * NLMBUFSIZE and fall back to malloc-backed storage otherwise.
 */
static bool
nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
{
	int type;

	if (is_linux)
		type = NS_WRITER_TYPE_LBUF;
	else
		type = (size <= NLMBUFSIZE) ?
		    NS_WRITER_TYPE_MBUF : NS_WRITER_TYPE_BUF;
	return (nlmsg_get_buf_type(nw, size, type, waitok));
}
bool
_nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
{
if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
return (false);
nw->arg.ptr = (void *)nlp;
nw->writer_target = NS_WRITER_TARGET_SOCKET;
nlmsg_set_callback(nw);
return (true);
nw->nlp = nlp;
nw->cb = nl_send_one;
return (nlmsg_get_buf(nw, size, false));
}
bool
_nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
{
if (!nlmsg_get_buf(nw, size, false, false))
return (false);
nw->arg.group.proto = protocol;
nw->arg.group.id = group_id;
nw->writer_target = NS_WRITER_TARGET_GROUP;
nlmsg_set_callback(nw);
return (true);
}
nw->group.proto = protocol;
nw->group.id = group_id;
nw->cb = nl_send_group;
bool
_nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
{
if (!nlmsg_get_buf(nw, size, false, false))
return (false);
*pm = NULL;
nw->arg.ptr = (void *)pm;
nw->writer_target = NS_WRITER_TARGET_CHAIN;
nlmsg_set_callback(nw);
NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
return (true);
return (nlmsg_get_buf(nw, size, false));
}
void
@ -576,18 +96,18 @@ _nlmsg_flush(struct nl_writer *nw)
if (__predict_false(nw->hdr != NULL)) {
/* Last message has not been completed, skip it. */
int completed_len = (char *)nw->hdr - nw->data;
int completed_len = (char *)nw->hdr - nw->buf->data;
/* Send completed messages */
nw->offset -= nw->offset - completed_len;
nw->buf->datalen -= nw->buf->datalen - completed_len;
nw->hdr = NULL;
}
}
NL_LOG(LOG_DEBUG2, "OUT");
bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
nw->_storage = NULL;
bool result = nw->cb(nw);
nw->num_messages = 0;
if (!result) {
NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
NL_LOG(LOG_DEBUG, "nw %p flush with %p() failed", nw, nw->cb);
}
return (result);
@ -599,59 +119,61 @@ _nlmsg_flush(struct nl_writer *nw)
* Return true on success.
*/
bool
_nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
_nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len)
{
struct nl_writer ns_new = {};
int completed_len, new_len;
struct nl_buf *new;
u_int completed_len, new_len, last_len;
MPASS(nw->buf != NULL);
if (nw->enomem)
return (false);
NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
nw->offset, nw->alloc_len, required_len);
NL_LOG(LOG_DEBUG3, "no space at offset %u/%u (want %u), trying to "
"reclaim", nw->buf->datalen, nw->buf->buflen, required_len);
/* Calculated new buffer size and allocate it s*/
completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
/* Calculate new buffer size and allocate it. */
completed_len = (nw->hdr != NULL) ?
(char *)nw->hdr - nw->buf->data : nw->buf->datalen;
if (completed_len > 0 && required_len < NLMBUFSIZE) {
/* We already ran out of space, use the largest effective size */
new_len = max(nw->alloc_len, NLMBUFSIZE);
/* We already ran out of space, use largest effective size. */
new_len = max(nw->buf->buflen, NLMBUFSIZE);
} else {
if (nw->alloc_len < NLMBUFSIZE)
if (nw->buf->buflen < NLMBUFSIZE)
/* XXXGL: does this happen? */
new_len = NLMBUFSIZE;
else
new_len = nw->alloc_len * 2;
new_len = nw->buf->buflen * 2;
while (new_len < required_len)
new_len *= 2;
}
bool waitok = (nw->malloc_flag == M_WAITOK);
bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
new = nl_buf_alloc(new_len, nw->malloc_flag | M_ZERO);
if (__predict_false(new == NULL)) {
nw->enomem = true;
NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
return (false);
}
if (nw->ignore_limit)
nlmsg_ignore_limit(&ns_new);
/* Update callback data */
ns_new.writer_target = nw->writer_target;
nlmsg_set_callback(&ns_new);
ns_new.arg = nw->arg;
/* Copy last (unfinished) header to the new storage */
int last_len = nw->offset - completed_len;
/* Copy last (unfinished) header to the new storage. */
last_len = nw->buf->datalen - completed_len;
if (last_len > 0) {
memcpy(ns_new.data, nw->hdr, last_len);
ns_new.hdr = (struct nlmsghdr *)ns_new.data;
ns_new.offset = last_len;
memcpy(new->data, nw->hdr, last_len);
new->datalen = last_len;
}
NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
NL_LOG(LOG_DEBUG2, "completed: %u bytes, copied: %u bytes",
completed_len, last_len);
/* Flush completed headers & switch to the new nw */
nlmsg_flush(nw);
memcpy(nw, &ns_new, sizeof(struct nl_writer));
NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
if (completed_len > 0) {
nlmsg_flush(nw);
MPASS(nw->buf == NULL);
} else
nl_buf_free(nw->buf);
nw->buf = new;
nw->hdr = (last_len > 0) ? (struct nlmsghdr *)new->data : NULL;
NL_LOG(LOG_DEBUG2, "switched buffer: used %u/%u bytes",
new->datalen, new->buflen);
return (true);
}
@ -660,17 +182,20 @@ bool
_nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
uint16_t flags, uint32_t len)
{
struct nl_buf *nb = nw->buf;
struct nlmsghdr *hdr;
u_int required_len;
MPASS(nw->hdr == NULL);
int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
if (__predict_false(nb->datalen + required_len > nb->buflen)) {
if (!nlmsg_refill_buffer(nw, required_len))
return (false);
nb = nw->buf;
}
hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
hdr = (struct nlmsghdr *)(&nb->data[nb->datalen]);
hdr->nlmsg_len = len;
hdr->nlmsg_type = type;
@ -679,7 +204,7 @@ _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
hdr->nlmsg_pid = portid;
nw->hdr = hdr;
nw->offset += sizeof(struct nlmsghdr);
nb->datalen += sizeof(struct nlmsghdr);
return (true);
}
@ -687,6 +212,8 @@ _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
bool
_nlmsg_end(struct nl_writer *nw)
{
struct nl_buf *nb = nw->buf;
MPASS(nw->hdr != NULL);
if (nw->enomem) {
@ -695,7 +222,7 @@ _nlmsg_end(struct nl_writer *nw)
return (false);
}
nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
nw->hdr->nlmsg_len = nb->data + nb->datalen - (char *)nw->hdr;
NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
@ -707,8 +234,10 @@ _nlmsg_end(struct nl_writer *nw)
void
_nlmsg_abort(struct nl_writer *nw)
{
struct nl_buf *nb = nw->buf;
if (nw->hdr != NULL) {
nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
nb->datalen = (char *)nw->hdr - nb->data;
nw->hdr = NULL;
}
}
@ -775,7 +304,7 @@ _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
/* Save operation result */
int *perror = nlmsg_reserve_object(nw, int);
NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
nw->offset, perror);
nw->buf->datalen, perror);
*perror = error;
nlmsg_end(nw);
nw->suppress_ack = true;
@ -787,40 +316,47 @@ _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
* KPI functions.
*/
int
u_int
nlattr_save_offset(const struct nl_writer *nw)
{
return (nw->offset - ((char *)nw->hdr - nw->data));
return (nw->buf->datalen - ((char *)nw->hdr - nw->buf->data));
}
void *
nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz)
{
sz = NETLINK_ALIGN(sz);
struct nl_buf *nb = nw->buf;
void *data;
if (__predict_false(nw->offset + sz > nw->alloc_len)) {
sz = NETLINK_ALIGN(sz);
if (__predict_false(nb->datalen + sz > nb->buflen)) {
if (!nlmsg_refill_buffer(nw, sz))
return (NULL);
nb = nw->buf;
}
void *data_ptr = &nw->data[nw->offset];
nw->offset += sz;
bzero(data_ptr, sz);
data = &nb->data[nb->datalen];
bzero(data, sz);
nb->datalen += sz;
return (data_ptr);
return (data);
}
bool
nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data)
{
int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
struct nl_buf *nb = nw->buf;
struct nlattr *nla;
u_int required_len;
if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
if (__predict_false(nb->datalen + required_len > nb->buflen)) {
if (!nlmsg_refill_buffer(nw, required_len))
return (false);
nb = nw->buf;
}
struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]);
nla = (struct nlattr *)(&nb->data[nb->datalen]);
nla->nla_len = attr_len + sizeof(struct nlattr);
nla->nla_type = attr_type;
@ -831,7 +367,7 @@ nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data)
}
memcpy((nla + 1), data, attr_len);
}
nw->offset += required_len;
nb->datalen += required_len;
return (true);
}

View File

@ -37,60 +37,41 @@
* It is not meant to be included directly
*/
struct mbuf;
struct nl_buf;
struct nl_writer;
typedef bool nl_writer_cb(struct nl_writer *nw, void *buf, int buflen, int cnt);
typedef bool nl_writer_cb(struct nl_writer *nw);
struct nl_writer {
int alloc_len; /* allocated buffer length */
int offset; /* offset from the start of the buffer */
struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */
char *data; /* pointer to the contiguous storage */
void *_storage; /* Underlying storage pointer */
nl_writer_cb *cb; /* Callback to flush data */
struct nl_buf *buf; /* Underlying storage pointer */
struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */
nl_writer_cb *cb; /* Callback to flush data */
union {
void *ptr;
struct nlpcb *nlp;
struct {
uint16_t proto;
uint16_t id;
} group;
} arg;
int num_messages; /* Number of messages in the buffer */
int malloc_flag; /* M_WAITOK or M_NOWAIT */
uint8_t writer_type; /* NS_WRITER_TYPE_* */
uint8_t writer_target; /* NS_WRITER_TARGET_* */
bool ignore_limit; /* If true, ignores RCVBUF limit */
bool enomem; /* True if ENOMEM occured */
bool suppress_ack; /* If true, don't send NLMSG_ERR */
};
u_int num_messages; /* Number of messages in the buffer */
int malloc_flag; /* M_WAITOK or M_NOWAIT */
bool ignore_limit; /* If true, ignores RCVBUF limit */
bool enomem; /* True if ENOMEM occured */
bool suppress_ack; /* If true, don't send NLMSG_ERR */
};
#define NS_WRITER_TARGET_SOCKET 0
#define NS_WRITER_TARGET_GROUP 1
#define NS_WRITER_TARGET_CHAIN 2
#define NS_WRITER_TYPE_MBUF 0
#define NS_WRITER_TYPE_BUF 1
#define NS_WRITER_TYPE_LBUF 2
#define NS_WRITER_TYPE_MBUFC 3
#define NS_WRITER_TYPE_STUB 4
#define NLMSG_SMALL 128
#define NLMSG_LARGE 2048
/* Message and attribute writing */
struct nlpcb;
#if defined(NETLINK) || defined(NETLINK_MODULE)
/* Provide optimized calls to the functions inside the same linking unit */
bool _nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp);
bool _nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id);
bool _nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm);
bool _nlmsg_flush(struct nl_writer *nw);
void _nlmsg_ignore_limit(struct nl_writer *nw);
bool _nlmsg_refill_buffer(struct nl_writer *nw, int required_size);
bool _nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len);
bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
uint16_t flags, uint32_t len);
bool _nlmsg_end(struct nl_writer *nw);
@ -111,12 +92,6 @@ nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int g
return (_nlmsg_get_group_writer(nw, expected_size, proto, group_id));
}
static inline bool
nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm)
{
return (_nlmsg_get_chain_writer(nw, expected_size, pm));
}
static inline bool
nlmsg_flush(struct nl_writer *nw)
{
@ -186,8 +161,6 @@ nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len)
hdr->nlmsg_flags, payload_len));
}
#define nlmsg_data(_hdr) ((void *)((_hdr) + 1))
/*
* KPI similar to mtodo():
* current (uncompleted) header is guaranteed to be contiguous,

View File

@ -181,7 +181,6 @@ const static struct nl_function_wrapper nl_module = {
.nlmsg_abort = _nlmsg_abort,
.nlmsg_get_unicast_writer = _nlmsg_get_unicast_writer,
.nlmsg_get_group_writer = _nlmsg_get_group_writer,
.nlmsg_get_chain_writer = _nlmsg_get_chain_writer,
.nlmsg_end_dump = _nlmsg_end_dump,
.nl_modify_ifp_generic = _nl_modify_ifp_generic,
.nl_store_ifp_cookie = _nl_store_ifp_cookie,
@ -219,7 +218,6 @@ netlink_modevent(module_t mod __unused, int what, void *priv __unused)
switch (what) {
case MOD_LOAD:
NL_LOG(LOG_DEBUG2, "Loading");
nl_init_msg_zone();
nl_osd_register();
#if !defined(NETLINK) && defined(NETLINK_MODULE)
nl_set_functions(&nl_module);
@ -235,7 +233,6 @@ netlink_modevent(module_t mod __unused, int what, void *priv __unused)
nl_set_functions(NULL);
#endif
nl_osd_unregister();
nl_destroy_msg_zone();
} else
ret = EBUSY;
break;

View File

@ -43,14 +43,9 @@
struct ucred;
struct nl_io_queue {
STAILQ_HEAD(, mbuf) head;
int length;
int hiwat;
};
struct nl_buf {
TAILQ_ENTRY(nl_buf) tailq;
struct mbuf *control;
u_int buflen;
u_int datalen;
u_int offset;
@ -72,7 +67,6 @@ struct nlpcb {
bool nl_linux; /* true if running under compat */
bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */
bool nl_need_thread_setup;
struct nl_io_queue tx_queue;
struct taskqueue *nl_taskqueue;
struct task nl_task;
struct ucred *nl_cred; /* Copy of nl_socket->so_cred */
@ -131,7 +125,7 @@ struct nl_proto_handler {
extern struct nl_proto_handler *nl_handlers;
/* netlink_domain.c */
void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id);
bool nl_send_group(struct nl_writer *);
void nl_osd_register(void);
void nl_osd_unregister(void);
void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp);
@ -139,22 +133,18 @@ void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp);
/* netlink_io.c */
#define NL_IOF_UNTRANSLATED 0x01
#define NL_IOF_IGNORE_LIMIT 0x02
bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags);
bool nl_send_one(struct nl_writer *);
void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg,
struct nl_pstate *npt);
void nl_on_transmit(struct nlpcb *nlp);
void nl_init_io(struct nlpcb *nlp);
void nl_free_io(struct nlpcb *nlp);
void nl_taskqueue_handler(void *_arg, int pending);
void nl_schedule_taskqueue(struct nlpcb *nlp);
void nl_process_receive_locked(struct nlpcb *nlp);
void nl_set_source_metadata(struct mbuf *m, int num_messages);
void nl_add_msg_info(struct mbuf *m);
/* netlink_message_writer.c */
void nl_init_msg_zone(void);
void nl_destroy_msg_zone(void);
void nl_add_msg_info(struct nl_buf *nb);
struct nl_buf *nl_buf_alloc(size_t len, int mflag);
void nl_buf_free(struct nl_buf *nb);
/* netlink_generic.c */
struct genl_family {

View File

@ -556,9 +556,8 @@ dump_rtentry(struct rtentry *rt, void *_arg)
IF_DEBUG_LEVEL(LOG_DEBUG3) {
char rtbuf[INET6_ADDRSTRLEN + 5];
FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
"Dump %s, offset %u, error %d",
rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
wa->nw->offset, error);
"Dump %s, error %d",
rt_print_buf(rt, rtbuf, sizeof(rtbuf)), error);
}
wa->error = error;
@ -578,7 +577,6 @@ dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
wa->count, wa->dumped);
NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
}
static int

View File

@ -4,19 +4,11 @@
from atf_python.ktest import BaseKernelTest
from atf_python.sys.netlink.attrs import NlAttrU32
M_NOWAIT = 1
M_WAITOK = 2
NS_WRITER_TYPE_MBUF = 0
NS_WRITER_TYPE_BUF = 1
NS_WRITER_TYPE_LBUF = 1
MHLEN = 160
MCLBYTES = 2048 # XXX: may differ on some archs?
MJUMPAGESIZE = mmap.PAGESIZE
MJUM9BYTES = 9 * 1024
MJUM16BYTES = 16 * 1024
NLMSG_SMALL = 128
NLMSG_LARGE = 2048
class TestNetlinkMessageWriter(BaseKernelTest):
KTEST_MODULE_NAME = "ktest_netlink_message_writer"
@ -28,52 +20,20 @@ class TestNetlinkMessageWriter(BaseKernelTest):
pytest.param(M_WAITOK, id="WAITOK"),
],
)
@pytest.mark.parametrize(
"writer_type",
[
pytest.param(NS_WRITER_TYPE_MBUF, id="MBUF"),
pytest.param(NS_WRITER_TYPE_BUF, id="BUF"),
],
)
@pytest.mark.parametrize(
"sz",
[
pytest.param([160, 160], id="MHLEN"),
pytest.param([MCLBYTES, MCLBYTES], id="MCLBYTES"),
pytest.param([NLMSG_SMALL, NLMSG_SMALL], id="NLMSG_SMALL"),
pytest.param([NLMSG_LARGE, NLMSG_LARGE], id="NLMSG_LARGE"),
pytest.param([NLMSG_LARGE + 256, NLMSG_LARGE + 256], id="NLMSG_LARGE+256"),
],
)
def test_mbuf_writer_allocation(self, sz, writer_type, malloc_flags):
def test_nlbuf_writer_allocation(self, sz, malloc_flags):
"""override to parametrize"""
test_meta = [
NlAttrU32(1, sz[0]), # size
NlAttrU32(2, sz[1]), # expected_avail
NlAttrU32(4, writer_type),
NlAttrU32(5, malloc_flags),
]
self.runtest(test_meta)
@pytest.mark.parametrize(
"malloc_flags",
[
pytest.param(M_NOWAIT, id="NOWAIT"),
pytest.param(M_WAITOK, id="WAITOK"),
],
)
@pytest.mark.parametrize(
"sz",
[
pytest.param([160, 160, 1], id="MHLEN"),
pytest.param([MCLBYTES, MCLBYTES, 1], id="MCLBYTES"),
pytest.param([MCLBYTES + 1, MCLBYTES + 1, 2], id="MCLBYTES_MHLEN"),
pytest.param([MCLBYTES + 256, MCLBYTES * 2, 2], id="MCLBYTESx2"),
],
)
def test_mbuf_chain_allocation(self, sz, malloc_flags):
test_meta = [
NlAttrU32(1, sz[0]), # size
NlAttrU32(2, sz[1]), # expected_avail
NlAttrU32(3, sz[2]), # expected_count
NlAttrU32(5, malloc_flags),
NlAttrU32(3, malloc_flags),
]
self.runtest(test_meta)