2 * Copyright (c) 2006 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/kern/kern_syslink.c,v 1.6 2007/03/21 20:06:34 dillon Exp $
37 * This module implements the syslink() system call and protocol which
38 * is used to glue clusters together as well as to interface userland
39 * devices and filesystems to the kernel.
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/malloc.h>
50 #include <sys/thread.h>
52 #include <sys/sysctl.h>
53 #include <sys/sysproto.h>
54 #include <sys/syslink.h>
55 #include <sys/syslink_msg.h>
57 #include <sys/thread2.h>
60 * Red-Black trees organizing the syslink 'router' nodes and connections
/*
 * Head types and prototypes for the two red-black trees used in this file:
 * slrouter_rb_tree links struct slrouter nodes and sldata_rb_tree links
 * struct sldata nodes, both through their rbnode member.  The last
 * RB_PROTOTYPE2 argument is the scalar key type accepted by the generated
 * lookup functions (sysid_t for route nodes, int for connections).
 */
66 RB_HEAD(slrouter_rb_tree, slrouter);
67 RB_HEAD(sldata_rb_tree, sldata);
68 RB_PROTOTYPE2(slrouter_rb_tree, slrouter, rbnode,
69 rb_slrouter_compare, sysid_t);
70 RB_PROTOTYPE2(sldata_rb_tree, sldata, rbnode,
71 rb_sldata_compare, int);
/*
 * Route node state.  Nodes are looked up by logid in slrouter_rb_root and
 * each one owns a tree of connections keyed by physical id.
 */
74 RB_ENTRY(slrouter) rbnode; /* linkage in slrouter_rb_root */
75 struct sldata_rb_tree sldata_rb_root; /* connections to router, keyed by physid */
76 sysid_t logid; /* logical sysid of router (tree key) */
77 int flags; /* flags passed on create */
78 int phybits; /* physical id bits: 1 << phybits ids accommodate connections */
79 int count; /* number of connections */
80 int nextphysid; /* physid search in sys_syslink() starts after this value */
85 * fileops interface. slbuf and sldata are also used in conjunction with a
86 * normal file descriptor.
/*
 * FIFO bookkeeping.  Throughout this file an index is masked with bufmask
 * to obtain an offset into buf, and (windex - rindex) is used as the number
 * of bytes currently queued.
 */
90 int bufsize; /* must be a power of 2 */
91 int bufmask; /* (bufsize - 1) */
92 int rindex; /* tail-chasing FIFO indices */
/*
 * Per-connection state.  A connection is linked into the sldata_rb_root of
 * its route node and keyed by physid.
 */
97 RB_ENTRY(sldata) rbnode; /* linkage in the router sldata_rb_root */
98 struct slrouter *router; /* organizing router */
101 struct file *xfp; /* external file pointer */
102 struct lock rlock; /* held by syslink_write() while filling rbuf */
103 struct lock wlock; /* held by syslink_read() while draining wbuf */
104 struct thread *rthread; /* xfp -> rbuf & process */
105 struct thread *wthread; /* wbuf -> xfp */
106 int flags; /* connection flags (SLF_*) */
/*
 * sldata->flags bits.  The QUIT bits are tested by the thread main loops,
 * the DONE bits are set by the threads on their way out.
 */
111 #define SLF_RQUIT 0x0001 /* syslink_rthread() loop exits when set */
112 #define SLF_WQUIT 0x0002 /* syslink_wthread() loop exits when set */
113 #define SLF_RDONE 0x0004 /* set by syslink_rthread() after its loop */
114 #define SLF_WDONE 0x0008 /* set by syslink_wthread() after its loop */
/* Size passed to slbuf_alloc() for both rbuf and wbuf. */
116 #define SYSLINK_BUFSIZE (128*1024)
/* Red-black tree comparison callbacks. */
118 static int rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2);
119 static int rb_sldata_compare(struct sldata *d1, struct sldata *d2);
/* File operations collected in syslinkops. */
121 static int syslink_read (struct file *fp, struct uio *uio,
122 struct ucred *cred, int flags);
123 static int syslink_write (struct file *fp, struct uio *uio,
124 struct ucred *cred, int flags);
125 static int syslink_close (struct file *fp);
126 static int syslink_stat (struct file *fp, struct stat *sb, struct ucred *cred);
127 static int syslink_shutdown (struct file *fp, int how);
128 static int syslink_ioctl (struct file *fp, u_long cmd, caddr_t data,
130 static int syslink_poll (struct file *fp, int events, struct ucred *cred);
131 static int syslink_kqfilter(struct file *fp, struct knote *kn);
/* Kernel thread bodies, buffer and reference helpers, message handling. */
133 static void syslink_rthread(void *arg);
134 static void syslink_wthread(void *arg);
135 static void slbuf_alloc(struct slbuf *buf, int bytes);
136 static void slbuf_free(struct slbuf *buf);
137 static void sldata_rels(struct sldata *sldata);
138 static void slrouter_rels(struct slrouter *slrouter);
139 static int process_syslink_msg(struct sldata *sldata, struct syslink_msg *head);
140 static int syslink_validate(struct syslink_msg *head, int bytes);
/*
 * Generate the tree implementations.  The final argument names the member
 * used as the lookup key: logid for route nodes, physid for connections.
 */
142 RB_GENERATE2(slrouter_rb_tree, slrouter, rbnode,
143 rb_slrouter_compare, sysid_t, logid);
144 RB_GENERATE2(sldata_rb_tree, sldata, rbnode,
145 rb_sldata_compare, int, physid);
/*
 * File operations vector.  sys_syslink() installs it as f_ops on the
 * descriptor it allocates with falloc().
 */
147 static struct fileops syslinkops = {
148 .fo_read = syslink_read,
149 .fo_write = syslink_write,
150 .fo_ioctl = syslink_ioctl,
151 .fo_poll = syslink_poll,
152 .fo_kqfilter = syslink_kqfilter,
153 .fo_stat = syslink_stat,
154 .fo_close = syslink_close,
155 .fo_shutdown = syslink_shutdown
/* Malloc type used for every kmalloc()/kfree() in this file. */
158 MALLOC_DEFINE(M_SYSLINK, "syslink", "syslink manager");
/*
 * Read/write sysctl kern.syslink_enabled.  sys_syslink() tests it before
 * doing anything else.
 */
160 static int syslink_enabled;
161 SYSCTL_INT(_kern, OID_AUTO, syslink_enabled,
162 CTLFLAG_RW, &syslink_enabled, 0, "Enable SYSLINK");
165 * Support declarations and compare function for our RB trees
/* Tree of all route nodes, searched by logid in sys_syslink(). */
167 static struct slrouter_rb_tree slrouter_rb_root;
/*
 * Comparison callback for slrouter_rb_tree: orders two route nodes by
 * their logical sysid (logid).
 */
170 rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2)
172 if (r1->logid < r2->logid)
174 if (r1->logid > r2->logid)
/*
 * Comparison callback for sldata_rb_tree: orders two connections by
 * their physical id (physid).
 */
180 rb_sldata_compare(struct sldata *d1, struct sldata *d2)
182 if (d1->physid < d2->physid)
184 if (d1->physid > d2->physid)
190 * Primary system call interface - associate a full-duplex stream
191 * (typically a pipe or a connected socket) with a sysid namespace,
192 * or create a direct link.
194 * syslink(int fd, int flags, sysid_t routenode)
/*
 * syslink() system call: attach a connection to the route node named by
 * uap->routenode, creating the route node first when requested.
 *
 * Steps as far as the statements here show:
 *  1. Gate on the kern.syslink_enabled sysctl and on suser().
 *  2. Allocate a spare slrouter, then look up uap->routenode.  The spare
 *     is freed on every path that does not insert it into the tree.
 *  3. An existing node with SYSLINKF_EXCL, or a missing node without
 *     SYSLINKF_CREAT, frees the spare (presumably followed by an error
 *     return -- confirm).  Otherwise a missing node is created from
 *     uap->flags; its SYSLINKF_PHYSBITS field must be in 2..31.
 *  4. Allocate an sldata, pick a physical id that is not yet present in
 *     the connection tree of the router (the scan starts after
 *     nextphysid), record it in nextphysid and insert the connection.
 *  5. Set up the transport.  One path allocates a new syslink descriptor
 *     with falloc() and starts only the reader thread; the other holds
 *     the descriptor uap->fd in xfp and starts reader and writer threads.
 *  6. Store the router in sldata->router, which activates the node.
 *
 * NOTE(review): confirm which reference the trailing slrouter_rels()
 * call is meant to drop.
 */
198 sys_syslink(struct syslink_args *uap)
200 struct slrouter *slrouter;
201 struct slrouter *slnew;
202 struct sldata *sldata;
210 * System call is under construction and disabled by default
212 if (syslink_enabled == 0)
214 error = suser(curthread);
219 * Lookup or create the route node using passed flags.
/*
 * The spare is allocated with M_WAITOK before the lookup, presumably so
 * that nothing can block between the lookup and a later RB_INSERT.
 */
221 slnew = kmalloc(sizeof(struct slrouter), M_SYSLINK, M_WAITOK|M_ZERO);
222 slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, uap->routenode);
225 * Existing route node
227 if (uap->flags & SYSLINKF_EXCL) {
228 kfree(slnew, M_SYSLINK);
232 kfree(slnew, M_SYSLINK);
233 } else if ((uap->flags & SYSLINKF_CREAT) == 0) {
235 * Non-existent, no create flag specified
237 kfree(slnew, M_SYSLINK);
241 * Create a new route node. Cannot block prior to tree insertion.
243 * Check the number of bits of physical id this route node can
244 * dispense for validity. The number of connections allowed must
245 * fit in a signed 32 bit integer.
247 int phybits = uap->flags & SYSLINKF_PHYSBITS;
249 if (phybits < 2 || phybits > 31) {
250 kfree(slnew, M_SYSLINK);
253 slnew->logid = uap->routenode;
255 slnew->phybits = phybits;
256 slnew->flags = uap->flags;
257 RB_INSERT(slrouter_rb_tree, &slrouter_rb_root, slnew);
258 RB_INIT(&slnew->sldata_rb_root);
/*
 * NOTE(review): phybits == 31 passes the range check above, but 1 << 31
 * does not fit in a signed 32 bit int.  Confirm the type of numphys and
 * whether the upper bound should be 30.
 */
261 numphys = 1 << slrouter->phybits;
264 * Create a connection to the route node and allocate a physical ID.
265 * Physical ID 0 is reserved for the route node itself.
267 sldata = kmalloc(sizeof(struct sldata), M_SYSLINK, M_WAITOK|M_ZERO);
269 if (slrouter->count + 1 >= numphys) {
271 kfree(sldata, M_SYSLINK);
274 physid = slrouter->nextphysid;
275 for (n = 0; n < numphys; ++n) {
276 if (++physid == numphys)
278 if (sldata_rb_tree_RB_LOOKUP(&slrouter->sldata_rb_root, physid) == NULL)
282 panic("sys_syslink: unexpected physical id allocation failure");
285 * Insert the node, initializing enough fields to prevent things from
286 * being ripped out from under us before we have a chance to complete
289 slrouter->nextphysid = physid;
290 sldata->physid = physid;
293 RB_INSERT(sldata_rb_tree, &slrouter->sldata_rb_root, sldata);
296 * Complete initialization of the physical route node. Setting
297 * sldata->router activates the node.
299 lockinit(&sldata->rlock, "slread", 0, 0);
300 lockinit(&sldata->wlock, "slwrite", 0, 0);
304 * We create a direct syslink descriptor. Only the reader thread
307 error = falloc(curproc, &fp, &uap->fd);
309 fp->f_type = DTYPE_SYSLINK;
310 fp->f_flag = FREAD | FWRITE;
311 fp->f_ops = &syslinkops;
313 slbuf_alloc(&sldata->rbuf, SYSLINK_BUFSIZE);
314 slbuf_alloc(&sldata->wbuf, SYSLINK_BUFSIZE);
315 sldata->refs += 2; /* reader thread and descriptor */
/* No writer thread is started on this path, so flag it as already stopped. */
316 sldata->flags = SLF_WQUIT | SLF_WDONE;
317 lwkt_create(syslink_rthread, sldata,
318 &sldata->rthread, NULL,
320 fsetfd(curproc, fp, uap->fd);
322 uap->sysmsg_result = uap->fd;
325 sldata->xfp = holdfp(curproc->p_fd, uap->fd, -1);
326 if (sldata->xfp != NULL) {
327 slbuf_alloc(&sldata->rbuf, SYSLINK_BUFSIZE);
328 slbuf_alloc(&sldata->wbuf, SYSLINK_BUFSIZE);
329 sldata->refs += 2; /* reader thread and writer thread */
330 lwkt_create(syslink_rthread, sldata,
331 &sldata->rthread, NULL,
333 lwkt_create(syslink_wthread, sldata,
334 &sldata->wthread, NULL,
340 sldata->router = slrouter;
343 slrouter_rels(slrouter);
348 * This thread reads from an external descriptor into rbuf, then parses and
349 * dispatches syslink messages from rbuf.
/*
 * Reader thread body; arg is the struct sldata of the connection.
 *
 * The outer loop runs until SLF_RQUIT is set in sldata->flags.  Each pass:
 *  - computes how many bytes are queued in rbuf (windex - rindex);
 *  - sizes a read as the contiguous room up to the end of the buffer,
 *    clamped to the free space, reads from xfp with fp_read() into buf at
 *    the masked windex, and advances windex by the byte count returned;
 *  - may sleep on the slbuf with wmesg "fiford";
 *  - then consumes records while at least min_msg_size bytes are queued.
 *    min_msg_size is the offset of sm_srcid in struct syslink_msg, i.e.
 *    the header that precedes it.  For each record sm_bytes is checked
 *    against min_msg_size, rounded up with SLMSG_ALIGN(), checked for
 *    wrapping past the end of the buffer (masked start offset greater than
 *    masked end offset) and for being fully present.  Records with
 *    sm_bytes below sizeof(struct syslink_msg) get separate handling;
 *    the others go to process_syslink_msg().  rindex then advances by the
 *    aligned record length.
 *
 * After the loop the thread sets SLF_RDONE, asks a writer that is not yet
 * done to quit (SLF_WQUIT plus wakeup on wbuf), and wakes any sleepers on
 * both rbuf and wbuf.
 */
353 syslink_rthread(void *arg)
355 struct sldata *sldata = arg;
356 struct slbuf *slbuf = &sldata->rbuf;
357 struct syslink_msg *head;
358 const int min_msg_size = offsetof(struct syslink_msg, sm_srcid);
360 while ((sldata->flags & SLF_RQUIT) == 0) {
366 * Calculate contiguous space available to read and read as much
369 * If the entire buffer is used there's probably a format error
370 * of some sort and we terminate the link.
372 used = slbuf->windex - slbuf->rindex;
376 * Read some data, terminate the link if an error occurs or if EOF
377 * is encountered. xfp can be NULL, indicating that the data was
378 * injected by other means.
381 count = slbuf->bufsize - (slbuf->windex & slbuf->bufmask);
382 if (count > slbuf->bufsize - used)
383 count = slbuf->bufsize - used;
386 error = fp_read(sldata->xfp,
387 slbuf->buf + (slbuf->windex & slbuf->bufmask),
388 count, &count, 0, UIO_SYSSPACE);
393 slbuf->windex += count;
396 tsleep(slbuf, 0, "fiford", 0);
400 * Process as many syslink messages as we can. The record length
401 * must be at least a minimal PAD record (8 bytes). A sm_cmd of 0
404 while (slbuf->windex - slbuf->rindex >= min_msg_size) {
407 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
408 if (head->sm_bytes < min_msg_size) {
412 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
417 if ((slbuf->rindex & slbuf->bufmask) >
418 ((slbuf->rindex + aligned_reclen) & slbuf->bufmask)
425 * Insufficient data read
427 if (slbuf->windex - slbuf->rindex < aligned_reclen)
431 * Process non-pad messages. Non-pad messages have to be at
432 * least the size of the syslink_msg structure.
435 if (head->sm_bytes < sizeof(struct syslink_msg)) {
439 error = process_syslink_msg(sldata, head);
444 slbuf->rindex += aligned_reclen;
451 * Mark us as done and deref sldata. Tell the writer to terminate as
454 sldata->flags |= SLF_RDONE;
455 if ((sldata->flags & SLF_WDONE) == 0) {
456 sldata->flags |= SLF_WQUIT;
457 wakeup(&sldata->wbuf);
459 wakeup(&sldata->rbuf);
460 wakeup(&sldata->wbuf);
465 * This thread takes outgoing syslink messages queued to wbuf and writes them
466 * to the descriptor. PAD is stripped. PAD is also added as required to
467 * conform to the outgoing descriptor's buffering requirements.
/*
 * Writer thread body; arg is the struct sldata of the connection.
 *
 * Runs until SLF_WQUIT is set.  Each pass looks at the record at the
 * masked rindex of wbuf: at least the header preceding sm_srcid must be
 * queued, sm_bytes must be no smaller than that header, and the aligned
 * record must neither wrap past the end of the buffer nor exceed the
 * queued byte count.  The aligned record is then handed to fp_write() on
 * xfp as-is (PAD included, see the XXX below), a short write is detected
 * by comparing the returned count with the aligned length, and rindex
 * advances by the aligned length.  The thread sleeps on the slbuf with
 * wmesg "fifowt" and sets SLF_WDONE once the loop has ended.
 */
471 syslink_wthread(void *arg)
473 struct sldata *sldata = arg;
474 struct slbuf *slbuf = &sldata->wbuf;
475 struct syslink_msg *head;
478 while ((sldata->flags & SLF_WQUIT) == 0) {
485 used = slbuf->windex - slbuf->rindex;
486 if (used < offsetof(struct syslink_msg, sm_srcid))
489 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
490 if (head->sm_bytes < offsetof(struct syslink_msg, sm_srcid)) {
494 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
499 if ((slbuf->rindex & slbuf->bufmask) >
500 ((slbuf->rindex + aligned_reclen) & slbuf->bufmask)
507 * Insufficient data read
509 if (used < aligned_reclen)
513 * Write it out whether it is PAD or not. XXX re-PAD for output
516 error = fp_write(sldata->xfp, head, aligned_reclen, &count,
520 if (count != aligned_reclen) {
524 slbuf->rindex += aligned_reclen;
528 tsleep(slbuf, 0, "fifowt", 0);
530 sldata->flags |= SLF_WDONE;
/*
 * Initialise a FIFO: zero the whole slbuf, allocate bytes of storage from
 * M_SYSLINK with M_WAITOK, and record the size and the index mask.
 * bufmask is computed as bytes - 1, so bytes has to be a power of 2 for
 * the masking done elsewhere in this file to work.
 */
536 slbuf_alloc(struct slbuf *slbuf, int bytes)
538 bzero(slbuf, sizeof(*slbuf));
539 slbuf->buf = kmalloc(bytes, M_SYSLINK, M_WAITOK);
540 slbuf->bufsize = bytes;
541 slbuf->bufmask = bytes - 1;
/* Release the storage that slbuf_alloc() obtained for this FIFO. */
546 slbuf_free(struct slbuf *slbuf)
548 kfree(slbuf->buf, M_SYSLINK);
/*
 * Drop one reference on a connection.  When the count reaches zero the
 * connection is removed from the connection tree of its router, both
 * FIFOs and the sldata itself are freed, and slrouter_rels() is called on
 * the router.  The router pointer is copied to a local first because it
 * is still needed after the sldata has been freed.
 */
554 sldata_rels(struct sldata *sldata)
556 struct slrouter *slrouter;
558 if (--sldata->refs == 0) {
559 slrouter = sldata->router;
560 KKASSERT(slrouter != NULL);
562 RB_REMOVE(sldata_rb_tree, &sldata->router->sldata_rb_root, sldata);
563 sldata->router = NULL;
564 slbuf_free(&sldata->rbuf);
565 slbuf_free(&sldata->wbuf);
566 kfree(sldata, M_SYSLINK);
567 slrouter_rels(slrouter);
/*
 * Drop one reference on a route node.  The decrement always happens; the
 * node is removed from slrouter_rb_root and freed only when the count has
 * reached zero and no connections remain in its sldata_rb_root.
 */
573 slrouter_rels(struct slrouter *slrouter)
575 if (--slrouter->refs == 0 && RB_EMPTY(&slrouter->sldata_rb_root)) {
576 RB_REMOVE(slrouter_rb_tree, &slrouter_rb_root, slrouter);
577 kfree(slrouter, M_SYSLINK);
582 * fileops for an established syslink when the kernel is asked to create a
583 * descriptor (versus one being handed to it). No threads are created in
588 * Transfer zero or more messages from the kernel to userland. Only complete
589 * messages are returned. If the uio has insufficient space then EMSGSIZE
590 * is returned. The kernel feeds messages to wbuf so we use wlock (structures
591 * are relative to the kernel).
/*
 * fo_read handler.  Works on wbuf while holding wlock exclusively.
 *
 * The blocking mode is selected from the per-call flags (O_FBLOCKING,
 * O_FNONBLOCKING) and otherwise from O_NONBLOCK in fp->f_flag.  contig is
 * the distance from the masked rindex to the end of the buffer and bytes
 * starts out as the number of queued bytes.  SLF_RDONE is tested while
 * waiting for data, and the wait itself is a tsleep() on the slbuf with
 * wmesg "fiford".  A uio with less room than bytes is rejected; otherwise
 * the data is copied out with uiomove() starting at the masked rindex and
 * rindex advances by bytes.  wlock is released before returning.
 */
595 syslink_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
597 struct sldata *sldata = fp->f_data;
598 struct slbuf *slbuf = &sldata->wbuf;
599 struct syslink_msg *head;
605 if (flags & O_FBLOCKING)
607 else if (flags & O_FNONBLOCKING)
609 else if (fp->f_flag & O_NONBLOCK)
614 lockmgr(&sldata->wlock, LK_EXCLUSIVE | LK_RETRY);
617 * Calculate the number of bytes we can transfer in one shot. Transfers
618 * do not wrap the FIFO.
620 contig = slbuf->bufsize - (slbuf->rindex & slbuf->bufmask);
622 bytes = slbuf->windex - slbuf->rindex;
625 if (sldata->flags & SLF_RDONE) {
633 tsleep(slbuf, 0, "fiford", 0);
639 * The uio must be able to accomodate the transfer.
641 if (uio->uio_resid < bytes) {
647 * Copy the data to userland and update rindex.
649 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
650 error = uiomove((caddr_t)head, bytes, uio);
652 slbuf->rindex += bytes;
658 lockmgr(&sldata->wlock, LK_RELEASE);
663 * Transfer zero or more messages from userland to the kernel. Only complete
664 * messages may be written. The kernel processes from rbuf so that is where
665 * we have to copy the messages.
/*
 * fo_write handler.  Works on rbuf while holding rlock exclusively.
 *
 * The blocking mode is selected the same way as in syslink_read().  A
 * request larger than the contiguous room between the masked windex and
 * the end of the buffer is refused.  The function then waits, with
 * tsleep() on the slbuf and wmesg "fifowr", until the free space in the
 * FIFO covers uio_resid, testing SLF_RDONE while it waits.  The data is
 * copied in with uiomove() at the masked windex and passed to
 * syslink_validate(); windex advances by the number of bytes copied.
 * rlock is released before returning.
 *
 * NOTE(review): the copy into the FIFO happens before validation.
 * Confirm that windex is only advanced when validation succeeded.
 */
669 syslink_write (struct file *fp, struct uio *uio, struct ucred *cred, int flags)
671 struct sldata *sldata = fp->f_data;
672 struct slbuf *slbuf = &sldata->rbuf;
673 struct syslink_msg *head;
679 if (flags & O_FBLOCKING)
681 else if (flags & O_FNONBLOCKING)
683 else if (fp->f_flag & O_NONBLOCK)
688 lockmgr(&sldata->rlock, LK_EXCLUSIVE | LK_RETRY);
691 * Calculate the maximum number of contiguous bytes that may be available.
692 * Caller is required to not wrap our FIFO.
694 contig = slbuf->bufsize - (slbuf->windex & slbuf->bufmask);
695 if (uio->uio_resid > contig) {
701 * Truncate based on actual unused space available in the FIFO. If
702 * the uio does not fit, block and loop.
705 bytes = slbuf->bufsize - (slbuf->windex - slbuf->rindex);
708 if (uio->uio_resid <= bytes)
710 if (sldata->flags & SLF_RDONE) {
718 tsleep(slbuf, 0, "fifowr", 0);
720 bytes = uio->uio_resid;
721 head = (void *)(slbuf->buf + (slbuf->windex & slbuf->bufmask));
722 error = uiomove((caddr_t)head, bytes, uio);
724 error = syslink_validate(head, bytes);
726 slbuf->windex += bytes;
730 lockmgr(&sldata->rlock, LK_RELEASE);
/*
 * fo_close handler.  Requests both threads to stop: each QUIT flag that
 * is not already set is set, followed by a wakeup on the matching FIFO
 * (rbuf for SLF_RQUIT, wbuf for SLF_WQUIT).
 *
 * NOTE(review): sldata is presumably taken from fp->f_data as in
 * syslink_read() and syslink_write() -- confirm.
 */
736 syslink_close (struct file *fp)
738 struct sldata *sldata;
741 if ((sldata->flags & SLF_RQUIT) == 0) {
742 sldata->flags |= SLF_RQUIT;
743 wakeup(&sldata->rbuf);
745 if ((sldata->flags & SLF_WQUIT) == 0) {
746 sldata->flags |= SLF_WQUIT;
747 wakeup(&sldata->wbuf);
/* fo_stat handler for syslink descriptors (installed through syslinkops). */
756 syslink_stat (struct file *fp, struct stat *sb, struct ucred *cred)
/* fo_shutdown handler for syslink descriptors (installed through syslinkops). */
763 syslink_shutdown (struct file *fp, int how)
/* fo_ioctl handler for syslink descriptors (installed through syslinkops). */
770 syslink_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred)
/* fo_poll handler for syslink descriptors (installed through syslinkops). */
777 syslink_poll (struct file *fp, int events, struct ucred *cred)
/* fo_kqfilter handler for syslink descriptors (installed through syslinkops). */
784 syslink_kqfilter(struct file *fp, struct knote *kn)
790 * This routine is called from a route node's reader thread to process a
791 * syslink message once it has been completely read and its size validated.
/*
 * Handle one complete message taken from rbuf by syslink_rthread().  The
 * only action visible here is logging sm_cmd with kprintf(); sldata is
 * not used by that statement.
 */
795 process_syslink_msg(struct sldata *sldata, struct syslink_msg *head)
797 kprintf("process syslink msg %08x\n", head->sm_cmd);
802 * Validate that the syslink message header(s) are correctly sized.
/*
 * Check the record headers in a buffer of bytes bytes starting at head.
 * syslink_write() calls this on data it has just copied in from userland.
 *
 * Checks made per record: the remaining length is at least the header that
 * precedes sm_srcid, it has no bits set in SL_ALIGNMASK, it is at least
 * sizeof(struct syslink_msg) when sm_cmd is non-zero, and it covers
 * SLMSG_ALIGN(sm_bytes).  bytes and head are then advanced by that
 * aligned length, presumably to repeat the checks on the next record.
 *
 * NOTE(review): sm_bytes itself is not range-checked before it is used
 * to advance.  If it can be zero the cursor would not move; confirm that
 * the enclosing loop cannot spin, and consider rejecting
 * sm_bytes < min_msg_size the way syslink_rthread() does.
 */
806 syslink_validate(struct syslink_msg *head, int bytes)
808 const int min_msg_size = offsetof(struct syslink_msg, sm_srcid);
813 * Message size and alignment
815 if (bytes < min_msg_size)
817 if (bytes & SL_ALIGNMASK)
819 if (head->sm_cmd && bytes < sizeof(struct syslink_msg))
823 * Buffer must contain entire record
825 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
826 if (bytes < aligned_reclen)
828 bytes -= aligned_reclen;
829 head = (void *)((char *)head + aligned_reclen);