2 * Copyright (c) 2006 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/kern/kern_syslink.c,v 1.7 2007/03/24 19:11:14 dillon Exp $
37 * This module implements the syslink() system call and protocol which
38 * is used to glue clusters together as well as to interface userland
39 * devices and filesystems to the kernel.
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/malloc.h>
50 #include <sys/thread.h>
52 #include <sys/sysctl.h>
53 #include <sys/sysproto.h>
54 #include <sys/syslink.h>
55 #include <sys/syslink_msg.h>
57 #include <sys/thread2.h>
60 * Red-Black trees organizing the syslink 'router' nodes and connections
66 RB_HEAD(slrouter_rb_tree, slrouter);
67 RB_HEAD(sldata_rb_tree, sldata);
68 RB_PROTOTYPE2(slrouter_rb_tree, slrouter, rbnode,
69 rb_slrouter_compare, sysid_t);
70 RB_PROTOTYPE2(sldata_rb_tree, sldata, rbnode,
71 rb_sldata_compare, int);
74 RB_ENTRY(slrouter) rbnode; /* list of routers */
75 struct sldata_rb_tree sldata_rb_root; /* connections to router */
76 sysid_t logid; /* logical sysid of router */
77 int flags; /* flags passed on create */
78 int phybits; /* accommodate connections */
79 int count; /* number of connections */
80 int nextphysid; /* next physid to allocate */
85 * fileops interface. slbuf and sldata are also used in conjunction with a
86 * normal file descriptor.
90 int bufsize; /* must be a power of 2 */
91 int bufmask; /* (bufsize - 1) */
92 int rindex; /* tail-chasing FIFO indices */
97 RB_ENTRY(sldata) rbnode;
98 struct slrouter *router; /* organizing router */
101 struct file *xfp; /* external file pointer */
102 struct lock rlock; /* synchronizing lock */
103 struct lock wlock; /* synchronizing lock */
104 struct thread *rthread; /* xfp -> rbuf & process */
105 struct thread *wthread; /* wbuf -> xfp */
106 int flags; /* connection flags */
111 #define SLF_RQUIT 0x0001
112 #define SLF_WQUIT 0x0002
113 #define SLF_RDONE 0x0004
114 #define SLF_WDONE 0x0008
116 #define SYSLINK_BUFSIZE (128*1024)
118 static int rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2);
119 static int rb_sldata_compare(struct sldata *d1, struct sldata *d2);
121 static int syslink_read (struct file *fp, struct uio *uio,
122 struct ucred *cred, int flags);
123 static int syslink_write (struct file *fp, struct uio *uio,
124 struct ucred *cred, int flags);
125 static int syslink_close (struct file *fp);
126 static int syslink_stat (struct file *fp, struct stat *sb, struct ucred *cred);
127 static int syslink_shutdown (struct file *fp, int how);
128 static int syslink_ioctl (struct file *fp, u_long cmd, caddr_t data,
130 static int syslink_poll (struct file *fp, int events, struct ucred *cred);
131 static int syslink_kqfilter(struct file *fp, struct knote *kn);
133 static void syslink_rthread(void *arg);
134 static void syslink_wthread(void *arg);
135 static void slbuf_alloc(struct slbuf *buf, int bytes);
136 static void slbuf_free(struct slbuf *buf);
137 static void sldata_rels(struct sldata *sldata);
138 static void slrouter_rels(struct slrouter *slrouter);
139 static int process_syslink_msg(struct sldata *sldata, struct syslink_msg *head);
140 static int syslink_validate(struct syslink_msg *head, int bytes);
142 RB_GENERATE2(slrouter_rb_tree, slrouter, rbnode,
143 rb_slrouter_compare, sysid_t, logid);
144 RB_GENERATE2(sldata_rb_tree, sldata, rbnode,
145 rb_sldata_compare, int, physid);
147 static struct fileops syslinkops = {
148 .fo_read = syslink_read,
149 .fo_write = syslink_write,
150 .fo_ioctl = syslink_ioctl,
151 .fo_poll = syslink_poll,
152 .fo_kqfilter = syslink_kqfilter,
153 .fo_stat = syslink_stat,
154 .fo_close = syslink_close,
155 .fo_shutdown = syslink_shutdown
158 MALLOC_DEFINE(M_SYSLINK, "syslink", "syslink manager");
160 static int syslink_enabled;
161 SYSCTL_INT(_kern, OID_AUTO, syslink_enabled,
162 CTLFLAG_RW, &syslink_enabled, 0, "Enable SYSLINK");
165 * Support declarations and compare function for our RB trees
167 static struct slrouter_rb_tree slrouter_rb_root;
170 rb_slrouter_compare(struct slrouter *r1, struct slrouter *r2)
172 if (r1->logid < r2->logid)
174 if (r1->logid > r2->logid)
180 rb_sldata_compare(struct sldata *d1, struct sldata *d2)
182 if (d1->physid < d2->physid)
184 if (d1->physid > d2->physid)
190 * Primary system call interface - associate a full-duplex stream
191 * (typically a pipe or a connected socket) with a sysid namespace,
192 * or create a direct link.
194 * syslink(int fd, int flags, sysid_t routenode)
198 sys_syslink(struct syslink_args *uap)
200 struct slrouter *slrouter;
201 struct slrouter *slnew;
202 struct sldata *sldata;
210 * System call is under construction and disabled by default
212 if (syslink_enabled == 0)
214 error = suser(curthread);
219 * Lookup or create the route node using passed flags.
221 slnew = kmalloc(sizeof(struct slrouter), M_SYSLINK, M_WAITOK|M_ZERO);
222 slrouter = slrouter_rb_tree_RB_LOOKUP(&slrouter_rb_root, uap->routenode);
225 * Existing route node
227 if (uap->flags & SYSLINKF_EXCL) {
228 kfree(slnew, M_SYSLINK);
232 kfree(slnew, M_SYSLINK);
233 } else if ((uap->flags & SYSLINKF_CREAT) == 0) {
235 * Non-existent, no create flag specified
237 kfree(slnew, M_SYSLINK);
241 * Create a new route node. Cannot block prior to tree insertion.
243 * Check the number of bits of physical id this route node can
244 * dispense for validity. The number of connections allowed must
245 * fit in a signed 32 bit integer.
247 int phybits = uap->flags & SYSLINKF_PHYSBITS;
249 if (phybits < 2 || phybits > 31) {
250 kfree(slnew, M_SYSLINK);
253 slnew->logid = uap->routenode;
255 slnew->phybits = phybits;
256 slnew->flags = uap->flags;
257 RB_INSERT(slrouter_rb_tree, &slrouter_rb_root, slnew);
258 RB_INIT(&slnew->sldata_rb_root);
261 numphys = 1 << slrouter->phybits;
264 * Create a connection to the route node and allocate a physical ID.
265 * Physical ID 0 is reserved for the route node itself.
267 sldata = kmalloc(sizeof(struct sldata), M_SYSLINK, M_WAITOK|M_ZERO);
269 if (slrouter->count + 1 >= numphys) {
271 kfree(sldata, M_SYSLINK);
274 physid = slrouter->nextphysid;
275 for (n = 0; n < numphys; ++n) {
276 if (++physid == numphys)
278 if (sldata_rb_tree_RB_LOOKUP(&slrouter->sldata_rb_root, physid) == NULL)
282 panic("sys_syslink: unexpected physical id allocation failure");
285 * Insert the node, initializing enough fields to prevent things from
286 * being ripped out from under us before we have a chance to complete
289 slrouter->nextphysid = physid;
290 sldata->physid = physid;
293 RB_INSERT(sldata_rb_tree, &slrouter->sldata_rb_root, sldata);
296 * Complete initialization of the physical route node. Setting
297 * sldata->router activates the node.
299 lockinit(&sldata->rlock, "slread", 0, 0);
300 lockinit(&sldata->wlock, "slwrite", 0, 0);
304 * We create a direct syslink descriptor. Only the reader thread
307 error = falloc(curproc, &fp, &uap->fd);
309 fp->f_type = DTYPE_SYSLINK;
310 fp->f_flag = FREAD | FWRITE;
311 fp->f_ops = &syslinkops;
313 slbuf_alloc(&sldata->rbuf, SYSLINK_BUFSIZE);
314 slbuf_alloc(&sldata->wbuf, SYSLINK_BUFSIZE);
315 sldata->refs += 2; /* reader thread and descriptor */
316 sldata->flags = SLF_WQUIT | SLF_WDONE;
317 lwkt_create(syslink_rthread, sldata,
318 &sldata->rthread, NULL,
320 fsetfd(curproc, fp, uap->fd);
322 uap->sysmsg_result = uap->fd;
325 sldata->xfp = holdfp(curproc->p_fd, uap->fd, -1);
326 if (sldata->xfp != NULL) {
327 slbuf_alloc(&sldata->rbuf, SYSLINK_BUFSIZE);
328 slbuf_alloc(&sldata->wbuf, SYSLINK_BUFSIZE);
329 sldata->refs += 2; /* reader thread and writer thread */
330 lwkt_create(syslink_rthread, sldata,
331 &sldata->rthread, NULL,
333 lwkt_create(syslink_wthread, sldata,
334 &sldata->wthread, NULL,
340 sldata->router = slrouter;
343 slrouter_rels(slrouter);
348 * This thread reads from an external descriptor into rbuf, then parses and
349 * dispatches syslink messages from rbuf.
353 syslink_rthread(void *arg)
355 struct sldata *sldata = arg;
356 struct slbuf *slbuf = &sldata->rbuf;
357 struct syslink_msg *head;
358 const int min_msg_size = SL_MIN_MESSAGE_SIZE;
360 while ((sldata->flags & SLF_RQUIT) == 0) {
366 * Calculate contiguous space available to read and read as much
369 * If the entire buffer is used there's probably a format error
370 * of some sort and we terminate the link.
372 used = slbuf->windex - slbuf->rindex;
376 * Read some data, terminate the link if an error occurs or if EOF
377 * is encountered. xfp can be NULL, indicating that the data was
378 * injected by other means.
381 count = slbuf->bufsize - (slbuf->windex & slbuf->bufmask);
382 if (count > slbuf->bufsize - used)
383 count = slbuf->bufsize - used;
386 error = fp_read(sldata->xfp,
387 slbuf->buf + (slbuf->windex & slbuf->bufmask),
388 count, &count, 0, UIO_SYSSPACE);
393 slbuf->windex += count;
396 tsleep(slbuf, 0, "fiford", 0);
400 * Process as many syslink messages as we can. The record length
401 * must be at least a minimal PAD record (8 bytes). A sm_cmd of 0
404 while (slbuf->windex - slbuf->rindex >= min_msg_size) {
407 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
408 if (head->sm_bytes < min_msg_size) {
412 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
417 if ((slbuf->rindex & slbuf->bufmask) >
418 ((slbuf->rindex + aligned_reclen) & slbuf->bufmask)
425 * Insufficient data read
427 if (slbuf->windex - slbuf->rindex < aligned_reclen)
431 * Process non-pad messages. Non-pad messages have to be at
432 * least the size of the syslink_msg structure.
434 * A PAD message's sm_cmd field contains 0.
437 if (head->sm_bytes < sizeof(struct syslink_msg)) {
441 error = process_syslink_msg(sldata, head);
446 slbuf->rindex += aligned_reclen;
453 * Mark us as done and deref sldata. Tell the writer to terminate as
456 sldata->flags |= SLF_RDONE;
457 if ((sldata->flags & SLF_WDONE) == 0) {
458 sldata->flags |= SLF_WQUIT;
459 wakeup(&sldata->wbuf);
461 wakeup(&sldata->rbuf);
462 wakeup(&sldata->wbuf);
467 * This thread takes outgoing syslink messages queued to wbuf and writes them
468 * to the descriptor. PAD is stripped. PAD is also added as required to
469 * conform to the outgoing descriptor's buffering requirements.
473 syslink_wthread(void *arg)
475 struct sldata *sldata = arg;
476 struct slbuf *slbuf = &sldata->wbuf;
477 struct syslink_msg *head;
480 while ((sldata->flags & SLF_WQUIT) == 0) {
487 used = slbuf->windex - slbuf->rindex;
488 if (used < SL_MIN_MESSAGE_SIZE)
491 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
492 if (head->sm_bytes < SL_MIN_MESSAGE_SIZE) {
496 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
501 if ((slbuf->rindex & slbuf->bufmask) >
502 ((slbuf->rindex + aligned_reclen) & slbuf->bufmask)
509 * Insufficient data read
511 if (used < aligned_reclen)
515 * Write it out whether it is PAD or not. XXX re-PAD for output
518 error = fp_write(sldata->xfp, head, aligned_reclen, &count,
522 if (count != aligned_reclen) {
526 slbuf->rindex += aligned_reclen;
530 tsleep(slbuf, 0, "fifowt", 0);
532 sldata->flags |= SLF_WDONE;
538 slbuf_alloc(struct slbuf *slbuf, int bytes)
540 bzero(slbuf, sizeof(*slbuf));
541 slbuf->buf = kmalloc(bytes, M_SYSLINK, M_WAITOK);
542 slbuf->bufsize = bytes;
543 slbuf->bufmask = bytes - 1;
548 slbuf_free(struct slbuf *slbuf)
550 kfree(slbuf->buf, M_SYSLINK);
556 sldata_rels(struct sldata *sldata)
558 struct slrouter *slrouter;
560 if (--sldata->refs == 0) {
561 slrouter = sldata->router;
562 KKASSERT(slrouter != NULL);
564 RB_REMOVE(sldata_rb_tree, &sldata->router->sldata_rb_root, sldata);
565 sldata->router = NULL;
566 slbuf_free(&sldata->rbuf);
567 slbuf_free(&sldata->wbuf);
568 kfree(sldata, M_SYSLINK);
569 slrouter_rels(slrouter);
575 slrouter_rels(struct slrouter *slrouter)
577 if (--slrouter->refs == 0 && RB_EMPTY(&slrouter->sldata_rb_root)) {
578 RB_REMOVE(slrouter_rb_tree, &slrouter_rb_root, slrouter);
579 kfree(slrouter, M_SYSLINK);
584 * fileops for an established syslink when the kernel is asked to create a
585 * descriptor (versus one being handed to it). No threads are created in
590 * Transfer zero or more messages from the kernel to userland. Only complete
591 * messages are returned. If the uio has insufficient space then EMSGSIZE
592 * is returned. The kernel feeds messages to wbuf so we use wlock (structures
593 * are relative to the kernel).
597 syslink_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
599 struct sldata *sldata = fp->f_data;
600 struct slbuf *slbuf = &sldata->wbuf;
601 struct syslink_msg *head;
607 if (flags & O_FBLOCKING)
609 else if (flags & O_FNONBLOCKING)
611 else if (fp->f_flag & O_NONBLOCK)
616 lockmgr(&sldata->wlock, LK_EXCLUSIVE | LK_RETRY);
619 * Calculate the number of bytes we can transfer in one shot. Transfers
620 * do not wrap the FIFO.
622 contig = slbuf->bufsize - (slbuf->rindex & slbuf->bufmask);
624 bytes = slbuf->windex - slbuf->rindex;
627 if (sldata->flags & SLF_RDONE) {
635 tsleep(slbuf, 0, "fiford", 0);
641 * The uio must be able to accommodate the transfer.
643 if (uio->uio_resid < bytes) {
649 * Copy the data to userland and update rindex.
651 head = (void *)(slbuf->buf + (slbuf->rindex & slbuf->bufmask));
652 error = uiomove((caddr_t)head, bytes, uio);
654 slbuf->rindex += bytes;
660 lockmgr(&sldata->wlock, LK_RELEASE);
665 * Transfer zero or more messages from userland to the kernel. Only complete
666 * messages may be written. The kernel processes from rbuf so that is where
667 * we have to copy the messages.
671 syslink_write (struct file *fp, struct uio *uio, struct ucred *cred, int flags)
673 struct sldata *sldata = fp->f_data;
674 struct slbuf *slbuf = &sldata->rbuf;
675 struct syslink_msg *head;
681 if (flags & O_FBLOCKING)
683 else if (flags & O_FNONBLOCKING)
685 else if (fp->f_flag & O_NONBLOCK)
690 lockmgr(&sldata->rlock, LK_EXCLUSIVE | LK_RETRY);
693 * Calculate the maximum number of contiguous bytes that may be available.
694 * Caller is required to not wrap our FIFO.
696 contig = slbuf->bufsize - (slbuf->windex & slbuf->bufmask);
697 if (uio->uio_resid > contig) {
703 * Truncate based on actual unused space available in the FIFO. If
704 * the uio does not fit, block and loop.
707 bytes = slbuf->bufsize - (slbuf->windex - slbuf->rindex);
710 if (uio->uio_resid <= bytes)
712 if (sldata->flags & SLF_RDONE) {
720 tsleep(slbuf, 0, "fifowr", 0);
722 bytes = uio->uio_resid;
723 head = (void *)(slbuf->buf + (slbuf->windex & slbuf->bufmask));
724 error = uiomove((caddr_t)head, bytes, uio);
726 error = syslink_validate(head, bytes);
728 slbuf->windex += bytes;
732 lockmgr(&sldata->rlock, LK_RELEASE);
738 syslink_close (struct file *fp)
740 struct sldata *sldata;
743 if ((sldata->flags & SLF_RQUIT) == 0) {
744 sldata->flags |= SLF_RQUIT;
745 wakeup(&sldata->rbuf);
747 if ((sldata->flags & SLF_WQUIT) == 0) {
748 sldata->flags |= SLF_WQUIT;
749 wakeup(&sldata->wbuf);
758 syslink_stat (struct file *fp, struct stat *sb, struct ucred *cred)
765 syslink_shutdown (struct file *fp, int how)
772 syslink_ioctl (struct file *fp, u_long cmd, caddr_t data, struct ucred *cred)
779 syslink_poll (struct file *fp, int events, struct ucred *cred)
786 syslink_kqfilter(struct file *fp, struct knote *kn)
792 * This routine is called from a route node's reader thread to process a
793 * syslink message once it has been completely read and its size validated.
797 process_syslink_msg(struct sldata *sldata, struct syslink_msg *head)
799 kprintf("process syslink msg %08x\n", head->sm_cmd);
804 * Validate that the syslink message header(s) are correctly sized.
808 syslink_validate(struct syslink_msg *head, int bytes)
810 const int min_msg_size = SL_MIN_MESSAGE_SIZE;
815 * Message size and alignment
817 if (bytes < min_msg_size)
819 if (bytes & SL_ALIGNMASK)
821 if (head->sm_cmd && bytes < sizeof(struct syslink_msg))
825 * Buffer must contain entire record
827 aligned_reclen = SLMSG_ALIGN(head->sm_bytes);
828 if (bytes < aligned_reclen)
830 bytes -= aligned_reclen;
831 head = (void *)((char *)head + aligned_reclen);