2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 #include <sys/xdiskioctl.h>
39 #include <machine/atomic.h>
/*
 * Per-copy volume configuration state used by the VOLCONF handling code.
 * NOTE(review): several members and the closing brace of this structure
 * are not visible in this view; only the visible members are described.
 */
41 struct hammer2_media_config {
42 hammer2_volconf_t copy_run; /* config in effect; copied from copy_pend by the volconf thread */
43 hammer2_volconf_t copy_pend; /* most recently received config, set by the usrmsg handler */
48 int pipefd[2]; /* signal stop: [0] is handed to the iocom as altfd, [1] is closed to stop it */
50 pthread_t iocom_thread; /* thread running dmsg_master_service for this copy */
51 enum { H2MC_STOPPED, H2MC_CONNECT, H2MC_RUNNING } state; /* connection state of this copy */
54 typedef struct hammer2_media_config hammer2_media_config_t;
56 #define H2CONFCTL_STOP 0x00000001
57 #define H2CONFCTL_UPDATE 0x00000002
60 TAILQ_ENTRY(diskcon) entry;
64 struct service_node_opaque {
67 dmsg_media_block_t block;
74 TAILQ_ENTRY(autoconn) entry;
78 int pipefd[2]; /* {read,write} */
79 enum { AUTOCONN_INACTIVE, AUTOCONN_ACTIVE } state;
85 TAILQ_HEAD(, diskcon) diskconq = TAILQ_HEAD_INITIALIZER(diskconq);
86 static pthread_mutex_t diskmtx;
87 static pthread_mutex_t confmtx;
89 static void *service_thread(void *data);
90 static void *udev_thread(void *data);
91 static void *autoconn_thread(void *data);
92 static void master_reconnect(const char *mntpt);
93 static void disk_reconnect(const char *disk);
94 static void disk_disconnect(void *handle);
95 static void udev_check_disks(void);
96 static void hammer2_usrmsg_handler(dmsg_msg_t *msg, int unmanaged);
97 static void *hammer2_volconf_thread(void *info);
98 static void hammer2_volconf_signal(dmsg_iocom_t *iocom);
99 static void hammer2_volconf_start(hammer2_media_config_t *conf,
100 const char *hostname);
101 static void hammer2_volconf_stop(hammer2_media_config_t *conf);
104 static void xdisk_connect(void);
107 * Start up the master listener daemon for the machine. This daemon runs
108 * a UDP discovery protocol, a TCP rendezvous, and scans certain files
109 * and directories for work.
113 * The only purpose for the UDP discovery protocol is to determine what
114 * other IPs on the LAN are running the hammer2 service daemon. DNS is not
115 * required to operate, but hostnames (if assigned) must be unique. If
116 * no hostname is assigned the host's IP is used as the name. This name
117 * is broadcast along with the mtime of the originator's private key.
119 * Receiving hammer2 service daemons which are able to match the label against
120 * /etc/hammer2/remote/<label>.pub will initiate a persistent connection
121 * to the target. Removal of the file will cause a disconnection. A failed
122 * public key negotiation stops further connection attempts until either the
123 * file is updated or the remote mtime is updated.
125 * Generally speaking this results in a web of connections, typically a
126 * combination of point-to-point for the more important links and relayed
127 * (spanning tree) for less important or filtered links.
131 * The TCP listener serves as a rendezvous point in the cluster, accepting
132 * connections, performing registrations and authentications, maintaining
133 * the spanning tree, and keeping track of message state so disconnects can
134 * be handled properly.
136 * Once authenticated only low-level messaging protocols (which includes
137 * tracking persistent messages) are handled by this daemon. This daemon
138 * does not run the higher level quorum or locking protocols.
142 * The file /etc/hammer2/autoconn, if it exists, contains a list of targets
143 * to connect to (which do not have to be on the local lan). This list will
144 * be retried until a connection can be established. The file is not usually
145 * needed for linkages local to the LAN.
150 struct sockaddr_in lsin;
155 * Acquire socket and set options
157 if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
158 fprintf(stderr, "master_listen: socket(): %s\n",
163 setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
166 * Setup listen port and try to bind. If the bind fails we assume
167 * that a master listener process is already running and silently
170 bzero(&lsin, sizeof(lsin));
171 lsin.sin_family = AF_INET;
172 lsin.sin_addr.s_addr = INADDR_ANY;
173 lsin.sin_port = htons(DMSG_LISTEN_PORT);
174 if (bind(lfd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
178 "master listen: daemon already running\n");
183 fprintf(stderr, "master listen: startup\n");
187 * Fork and disconnect the controlling terminal and parent process,
188 * executing the specified function as a pthread.
190 * Returns to the original process which can then continue running.
191 * In debug mode this call will create the pthread without forking
192 * and set NormalExit to 0, instead of fork.
194 hammer2_demon(service_thread, (void *)(intptr_t)lfd);
201 * Master listen/accept thread. Accept connections on the master socket,
202 * starting a pthread for each one.
/*
 * Body of the master listen/accept thread.
 *
 * `data` carries the listening socket descriptor, cast through intptr_t.
 * The thread detaches itself, starts the udev and autoconn helper threads,
 * reconnects every mounted filesystem whose type is "hammer2", and then
 * accepts connections on the listening socket, handing each accepted
 * descriptor to a new pthread running dmsg_master_service.
 *
 * NOTE(review): parts of this function (loop header, error handling, some
 * assignments to `info`) are not visible in this view.
 */
206 service_thread(void *data)
208 struct sockaddr_in asin;
211 dmsg_master_service_info_t *info;
212 int lfd = (int)(intptr_t)data;
217 struct statfs *mntbuf = NULL;
218 struct statvfs *mntvbuf = NULL;
221 * Nobody waits for us
223 setproctitle("hammer2 master listen");
224 pthread_detach(pthread_self());
227 * Start up a thread to handle block device monitoring for
228 * export to the cluster.
231 pthread_create(&thread, NULL, udev_thread, NULL);
234 * Start up a thread to tie /dev/xdisk into the cluster
240 * Start thread to manage /etc/hammer2/autoconn
243 pthread_create(&thread, NULL, autoconn_thread, NULL);
246 * Scan existing hammer2 mounts and reconnect to them using
247 * HAMMER2IOC_RECLUSTER.
249 count = getmntvinfo(&mntbuf, &mntvbuf, MNT_NOWAIT);
250 for (i = 0; i < count; ++i) {
251 if (strcmp(mntbuf[i].f_fstypename, "hammer2") == 0)
252 master_reconnect(mntbuf[i].f_mntonname);
256 * Accept connections and create pthreads to handle them after
261 fd = accept(lfd, (struct sockaddr *)&asin, &alen);
268 setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof opt);
270 fprintf(stderr, "service_thread: accept fd %d\n", fd);
/* NOTE(review): the malloc() result is used without a NULL check on the visible lines. */
271 info = malloc(sizeof(*info));
272 bzero(info, sizeof(*info));
275 info->usrmsg_callback = hammer2_usrmsg_handler;
276 info->label = strdup("client");
277 pthread_create(&thread, NULL, dmsg_master_service, info);
283 * Handle/Monitor the dmsg stream. If unmanaged is set we are responsible
284 * for responding to the message, otherwise if it is not set libdmsg has
285 * already done some preprocessing and will respond to the message for us
288 * We primarily monitor for VOLCONFs
/*
 * User-message callback installed on the dmsg service threads.
 *
 * Messages whose state is not part of a LNK_CONN stream are passed to
 * hammer2_shell_parse().  For LNK_CONN streams:
 *
 *  - On the delete/error cases, every per-copy config that has a thread is
 *    told to stop (H2CONFCTL_STOP + condition signal), the threads are
 *    joined, the condition variables are destroyed and the media's
 *    usrhandle is cleared.
 *
 *  - On DMSG_LNK_HAMMER2_VOLCONF, the per-media config array
 *    (HAMMER2_COPYID_COUNT entries) is allocated on first use, the message's
 *    index and path are validated, the new config is stored in copy_pend
 *    with H2CONFCTL_UPDATE set under confmtx, a volconf thread is started
 *    for that slot if none exists, and the slot's condition is signalled.
 *
 * NOTE(review): the switch header, several break/return statements and the
 * label guarding the DMSG_ERR_NOSUPP reply are not visible in this view.
 */
292 hammer2_usrmsg_handler(dmsg_msg_t *msg, int unmanaged)
295 hammer2_media_config_t *conf;
296 dmsg_lnk_hammer2_volconf_t *msgconf;
300 * Only process messages which are part of a LNK_CONN stream
304 (state->rxcmd & DMSGF_BASECMDMASK) != DMSG_LNK_CONN) {
305 hammer2_shell_parse(msg, unmanaged);
310 case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
311 case DMSG_LNK_CONN | DMSGF_DELETE:
312 case DMSG_LNK_ERROR | DMSGF_DELETE:
314 * Deleting connection, clean out all volume configs
316 if (state->media == NULL || state->media->usrhandle == NULL)
318 conf = state->media->usrhandle;
319 fprintf(stderr, "Shutting down media spans\n");
/* First pass: request a stop on every slot that has a thread. */
320 for (i = 0; i < HAMMER2_COPYID_COUNT; ++i) {
321 if (conf[i].thread) {
322 conf[i].ctl = H2CONFCTL_STOP;
323 pthread_cond_signal(&conf[i].cond);
/* Second pass: wait for each of those threads to exit. */
326 for (i = 0; i < HAMMER2_COPYID_COUNT; ++i) {
327 if (conf[i].thread) {
328 pthread_join(conf[i].thread, NULL);
330 pthread_cond_destroy(&conf[i].cond);
333 state->media->usrhandle = NULL;
336 case DMSG_LNK_HAMMER2_VOLCONF:
338 * One-way volume-configuration message is transmitted
339 * over the open LNK_CONN transaction.
341 fprintf(stderr, "RECEIVED VOLCONF\n");
/* Lazily allocate the zeroed per-copy config array for this media. */
343 if ((conf = state->media->usrhandle) == NULL) {
344 conf = malloc(sizeof(*conf) * HAMMER2_COPYID_COUNT);
345 bzero(conf, sizeof(*conf) * HAMMER2_COPYID_COUNT);
346 state->media->usrhandle = conf;
348 msgconf = H2_LNK_VOLCONF(msg);
350 if (msgconf->index < 0 ||
351 msgconf->index >= HAMMER2_COPYID_COUNT) {
353 "VOLCONF: ILLEGAL INDEX %d\n",
/* The path must be non-empty and NUL-terminated within its buffer. */
357 if (msgconf->copy.path[sizeof(msgconf->copy.path) - 1] != 0 ||
358 msgconf->copy.path[0] == 0) {
360 "VOLCONF: ILLEGAL PATH %d\n",
364 conf += msgconf->index;
365 pthread_mutex_lock(&confmtx);
366 conf->copy_pend = msgconf->copy;
367 conf->ctl |= H2CONFCTL_UPDATE;
368 pthread_mutex_unlock(&confmtx);
369 if (conf->thread == NULL) {
370 fprintf(stderr, "VOLCONF THREAD STARTED\n");
371 pthread_cond_init(&conf->cond, NULL);
372 pthread_create(&conf->thread, NULL,
373 hammer2_volconf_thread, (void *)conf);
375 pthread_cond_signal(&conf->cond);
379 dmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
/*
 * Per-copy volume configuration thread.  `info` is the
 * hammer2_media_config_t slot this thread manages.
 *
 * Loops, holding confmtx, until H2CONFCTL_STOP is set in conf->ctl.  When
 * H2CONFCTL_UPDATE is set the flag is cleared and copy_run is compared with
 * copy_pend; on a change the current connection is stopped, copy_pend
 * becomes copy_run, and a new connection is started when the copyid is
 * non-zero and the path begins with "span:" (the text after that prefix is
 * passed as the host name).  A slot left in H2MC_CONNECT state is retried.
 * Otherwise the thread waits on conf->cond.  On exit the connection is
 * stopped.
 *
 * NOTE(review): some control-flow lines (else branches, continue, sleep or
 * return statements) are not visible in this view.
 */
385 hammer2_volconf_thread(void *info)
387 hammer2_media_config_t *conf = info;
389 pthread_mutex_lock(&confmtx);
390 while ((conf->ctl & H2CONFCTL_STOP) == 0) {
391 if (conf->ctl & H2CONFCTL_UPDATE) {
392 fprintf(stderr, "VOLCONF UPDATE\n");
393 conf->ctl &= ~H2CONFCTL_UPDATE;
394 if (bcmp(&conf->copy_run, &conf->copy_pend,
395 sizeof(conf->copy_run)) == 0) {
396 fprintf(stderr, "VOLCONF: no changes\n");
400 * XXX TODO - auto reconnect on lookup failure or
401 * connect failure or stream failure.
/* confmtx is dropped around the stop/start calls below. */
404 pthread_mutex_unlock(&confmtx);
405 hammer2_volconf_stop(conf);
406 conf->copy_run = conf->copy_pend;
407 if (conf->copy_run.copyid != 0 &&
408 strncmp(conf->copy_run.path, "span:", 5) == 0) {
409 hammer2_volconf_start(conf,
410 conf->copy_run.path + 5);
412 pthread_mutex_lock(&confmtx);
413 fprintf(stderr, "VOLCONF UPDATE DONE state %d\n", conf->state);
415 if (conf->state == H2MC_CONNECT) {
416 hammer2_volconf_start(conf, conf->copy_run.path + 5);
417 pthread_mutex_unlock(&confmtx);
419 pthread_mutex_lock(&confmtx);
421 pthread_cond_wait(&conf->cond, &confmtx);
424 pthread_mutex_unlock(&confmtx);
425 hammer2_volconf_stop(conf);
/*
 * Try to bring up the connection for one volume-config slot.
 *
 * conf     - slot to start; its state selects the action taken.
 * hostname - target passed to dmsg_connect().
 *
 * On a connect failure or a pipe() failure the slot is left in
 * H2MC_CONNECT state.  Otherwise a dmsg_master_service_info_t is allocated
 * and zeroed, the read end of conf->pipefd is installed as altfd with
 * hammer2_volconf_signal as its callback, the slot is marked H2MC_RUNNING
 * and dmsg_master_service is started on conf->iocom_thread.
 *
 * NOTE(review): the case labels and some statements (including any
 * descriptor cleanup on the pipe() failure path) are not visible here.
 */
431 hammer2_volconf_start(hammer2_media_config_t *conf, const char *hostname)
433 dmsg_master_service_info_t *info;
435 switch(conf->state) {
438 conf->fd = dmsg_connect(hostname);
440 fprintf(stderr, "Unable to connect to %s\n", hostname);
441 conf->state = H2MC_CONNECT;
442 } else if (pipe(conf->pipefd) < 0) {
444 fprintf(stderr, "pipe() failed during volconf\n");
445 conf->state = H2MC_CONNECT;
447 fprintf(stderr, "VOLCONF CONNECT\n");
448 info = malloc(sizeof(*info));
449 bzero(info, sizeof(*info));
451 info->altfd = conf->pipefd[0];
452 info->altmsg_callback = hammer2_volconf_signal;
453 info->usrmsg_callback = hammer2_usrmsg_handler;
455 conf->state = H2MC_RUNNING;
456 pthread_create(&conf->iocom_thread, NULL,
457 dmsg_master_service, info);
/*
 * Stop the connection for one volume-config slot and leave it in
 * H2MC_STOPPED state.
 *
 * For a slot with a service thread, the write end of conf->pipefd is
 * closed and conf->iocom_thread is joined before the state is reset.
 *
 * NOTE(review): the case labels are not visible in this view, so which
 * states map to which branch cannot be confirmed from here.
 */
467 hammer2_volconf_stop(hammer2_media_config_t *conf)
469 switch(conf->state) {
473 conf->state = H2MC_STOPPED;
476 close(conf->pipefd[1]);
477 conf->pipefd[1] = -1;
478 pthread_join(conf->iocom_thread, NULL);
479 conf->iocom_thread = NULL;
480 conf->state = H2MC_STOPPED;
/*
 * altmsg callback installed by hammer2_volconf_start(); atomically sets
 * DMSG_IOCOMF_EOF in the iocom's flags.
 */
487 hammer2_volconf_signal(dmsg_iocom_t *iocom)
489 atomic_set_int(&iocom->flags, DMSG_IOCOMF_EOF);
493 * Monitor block devices. Currently polls every ~10 seconds or so.
/*
 * Block device monitoring thread (argument unused).  Detaches itself,
 * opens UDEV_DEVICE_PATH read/write and loops for as long as the UDEVWAIT
 * ioctl succeeds.
 *
 * NOTE(review): the loop body and the open-failure handling are not
 * visible in this view.
 */
497 udev_thread(void *data __unused)
502 pthread_detach(pthread_self());
504 if ((fd = open(UDEV_DEVICE_PATH, O_RDWR)) < 0) {
505 fprintf(stderr, "udev_thread: unable to open \"%s\"\n",
510 while (ioctl(fd, UDEVWAIT, &seq) == 0) {
517 static void *autoconn_connect_thread(void *data);
518 static void autoconn_disconnect_signal(dmsg_iocom_t *iocom);
/*
 * Thread managing the HAMMER2_DEFAULT_DIR "/autoconn" file (argument
 * unused).
 *
 * Keeps a private list of autoconn entries keyed by host name.  The file is
 * polled with stat(); it is rescanned when its mtime differs from the last
 * synchronized value (lmod), and a scan is discarded when the mtime changed
 * while scanning.  After a scan:
 *
 *  - a staged entry that is inactive gets a pipe and a new
 *    autoconn_connect_thread, and is marked AUTOCONN_ACTIVE;
 *  - an unstaged entry that is active is asked to stop by writing one byte
 *    to its pipe (once, guarded by ac->stopme);
 *  - an unstaged entry that is inactive is removed from the list.
 *
 * NOTE(review): the outer loop, the sleeps, the staging assignments and the
 * release of removed entries are not visible in this view.
 */
522 autoconn_thread(void *data __unused)
524 TAILQ_HEAD(, autoconn) autolist;
526 struct autoconn *next;
535 TAILQ_INIT(&autolist);
539 pthread_detach(pthread_self());
547 * Poll the file. Loop up if the synchronized state (lmod)
550 if (stat(HAMMER2_DEFAULT_DIR "/autoconn", &st) == 0) {
551 if (lmod == st.st_mtime)
553 fp = fopen(HAMMER2_DEFAULT_DIR "/autoconn", "r");
563 * Wait at least 5 seconds after the file is created or
566 * Do not update the synchronized state.
568 if (fp == NULL && found_last) {
571 } else if (fp && found_last == 0) {
578 * Don't scan the file until the time progresses past the
579 * file's mtime, so we can validate that the file was not
580 * further modified during our scan.
582 * Do not update the synchronized state.
586 if (t == st.st_mtime) {
596 * Set staging to disconnect, then scan the file.
598 TAILQ_FOREACH(ac, &autolist, entry)
600 while (fp && fgets(buf, sizeof(buf), fp) != NULL) {
/* The first whitespace-delimited token on each line is the host name. */
603 if ((host = strtok(buf, " \t\r\n")) == NULL ||
607 TAILQ_FOREACH(ac, &autolist, entry) {
608 if (strcmp(host, ac->host) == 0)
/* Creates a zeroed, inactive entry for the host and appends it to the list. */
612 ac = malloc(sizeof(*ac));
613 bzero(ac, sizeof(*ac));
614 ac->host = strdup(host);
615 ac->state = AUTOCONN_INACTIVE;
616 TAILQ_INSERT_TAIL(&autolist, ac, entry);
622 * Ignore the scan (and retry again) if the file was
623 * modified during the scan.
625 * Do not update the synchronized state.
628 if (fstat(fileno(fp), &st) < 0) {
633 if (t != st.st_mtime)
638 * Update the synchronized state and reconfigure the
639 * connect list as needed.
/* `next` is fetched before the body runs because the entry may be removed. */
642 next = TAILQ_FIRST(&autolist);
643 while ((ac = next) != NULL) {
644 next = TAILQ_NEXT(ac, entry);
649 if (ac->stage && ac->state == AUTOCONN_INACTIVE) {
650 if (pipe(ac->pipefd) == 0) {
652 ac->state = AUTOCONN_ACTIVE;
654 pthread_create(&thread, NULL,
655 autoconn_connect_thread,
661 * Unstaging, stop active connection.
663 * We write to the pipe which causes the iocom_core
664 * to call autoconn_disconnect_signal().
666 if (ac->stage == 0 &&
667 ac->state == AUTOCONN_ACTIVE) {
668 if (ac->stopme == 0) {
671 write(ac->pipefd[1], &dummy, 1);
676 * Unstaging, delete inactive connection.
678 if (ac->stage == 0 &&
679 ac->state == AUTOCONN_INACTIVE) {
680 TAILQ_REMOVE(&autolist, ac, entry);
/*
 * Connection thread for one autoconn entry.
 *
 * Detaches itself, then repeats while ac->stopme is zero: connect to
 * ac->host with dmsg_connect(); on success run dmsg_master_service in a
 * separate pthread and join it.  The service is given the read end of the
 * entry's pipe as altfd with autoconn_disconnect_signal as the callback,
 * and noclosealt is set.  After the loop the read end of the pipe is closed
 * and the entry is marked AUTOCONN_INACTIVE.
 *
 * NOTE(review): the derivation of `ac` from `data`, the failure-path delay
 * and the handling of the pipe's write end are not visible in this view.
 */
693 autoconn_connect_thread(void *data)
695 dmsg_master_service_info_t *info;
701 pthread_detach(pthread_self());
703 while (ac->stopme == 0) {
704 fd = dmsg_connect(ac->host);
706 if (DMsgDebugOpt > 2) {
708 "autoconn: Connect failure: %s\n",
714 fprintf(stderr, "autoconn: Connect %s\n", ac->host);
716 info = malloc(sizeof(*info));
717 bzero(info, sizeof(*info));
719 info->altfd = ac->pipefd[0];
720 info->altmsg_callback = autoconn_disconnect_signal;
721 info->usrmsg_callback = hammer2_usrmsg_handler;
723 info->noclosealt = 1;
724 pthread_create(&ac->thread, NULL, dmsg_master_service, info);
725 pthread_join(ac->thread, &res);
727 close(ac->pipefd[0]);
728 ac->state = AUTOCONN_INACTIVE;
729 /* auto structure can be ripped out here */
/*
 * altmsg callback installed by autoconn_connect_thread(); logs the shutdown
 * and atomically sets DMSG_IOCOMF_EOF in the iocom's flags.
 */
735 autoconn_disconnect_signal(dmsg_iocom_t *iocom)
737 fprintf(stderr, "autoconn: Shutting down socket\n");
738 atomic_set_int(&iocom->flags, DMSG_IOCOMF_EOF);
742 * Retrieve the list of disk attachments and attempt to export
/*
 * Read the "kern.disks" sysctl and call disk_reconnect() for each
 * WS-delimited name in the returned string.
 *
 * The sysctl is queried twice: first with a NULL buffer to obtain the
 * required size in `n`, then again to fetch the contents into `buf`.
 *
 * NOTE(review): buffer selection/allocation when n >= sizeof(tmpbuf), the
 * error returns and any release of `buf` are not visible in this view.
 */
747 udev_check_disks(void)
757 error = sysctlbyname("kern.disks", NULL, &n, NULL, 0);
758 if (error < 0 || n == 0)
760 if (n >= sizeof(tmpbuf))
764 error = sysctlbyname("kern.disks", buf, &n, NULL, 0);
777 fprintf(stderr, "DISKS: %s\n", buf);
778 for (disk = strtok(buf, WS); disk; disk = strtok(NULL, WS)) {
779 disk_reconnect(disk);
787 * Normally the mount program supplies a cluster communications
788 * descriptor to the hammer2 vfs on mount, but if you kill the service
789 * daemon and restart it that link will be lost.
791 * This procedure attempts to [re]connect to existing mounts when
792 * the service daemon is started up before going into its accept
795 * NOTE: A hammer2 mount point can only accommodate one connection at a time
796 * so this will disconnect any existing connection during the
/*
 * [Re]connect the service daemon to the hammer2 mount at `mntpt`.
 *
 * Opens the mount point read-only, creates a pipe, and passes pipefds[0]
 * to the filesystem with the HAMMER2IOC_RECLUSTER ioctl.  On success a
 * zeroed dmsg_master_service_info_t is set up with pipefds[1] as its
 * descriptor and the label "hammer2", and dmsg_master_service is started
 * in a new pthread.  Each failure is reported on stderr.
 *
 * NOTE(review): the descriptor cleanup and returns on the failure paths
 * are not visible in this view.
 */
801 master_reconnect(const char *mntpt)
803 struct hammer2_ioc_recluster recls;
804 dmsg_master_service_info_t *info;
809 fd = open(mntpt, O_RDONLY);
811 fprintf(stderr, "reconnect %s: no access to mount\n", mntpt);
814 if (pipe(pipefds) < 0) {
815 fprintf(stderr, "reconnect %s: pipe() failed\n", mntpt);
819 bzero(&recls, sizeof(recls));
820 recls.fd = pipefds[0];
821 if (ioctl(fd, HAMMER2IOC_RECLUSTER, &recls) < 0) {
822 fprintf(stderr, "reconnect %s: ioctl failed\n", mntpt);
831 info = malloc(sizeof(*info));
832 bzero(info, sizeof(*info));
833 info->fd = pipefds[1];
835 info->usrmsg_callback = hammer2_usrmsg_handler;
836 info->label = strdup("hammer2");
837 pthread_create(&thread, NULL, dmsg_master_service, info);
841 * Reconnect a physical disk service to the mesh.
/*
 * Export the disk named `disk` to the mesh unless it is already connected.
 *
 * Names beginning with "md" or "xa" are tested first.  The global diskconq
 * list is then searched under diskmtx for an entry with the same name.  For
 * a new disk, the device node under /dev is opened read-only, a pipe is
 * created, and pipefds[0] is passed to the kernel with the DIOCRECLUSTER
 * ioctl.  A diskcon record is appended to diskconq under diskmtx and
 * dmsg_master_service is started in a new pthread on pipefds[1], with
 * disk_disconnect as the exit callback and the disk name as label.
 *
 * NOTE(review): the returns after the prefix and already-connected checks,
 * the failure-path cleanup and the release of `path` are not visible in
 * this view.  On the visible lines the results of asprintf() and malloc()
 * are used unchecked.
 */
845 disk_reconnect(const char *disk)
847 struct disk_ioc_recluster recls;
849 dmsg_master_service_info_t *info;
856 * Urm, this will auto-create mdX+1, just ignore for now.
857 * This mechanic needs to be fixed. It might actually be nice
858 * to be able to export md disks.
860 if (strncmp(disk, "md", 2) == 0)
862 if (strncmp(disk, "xa", 2) == 0)
866 * Check if already connected
868 pthread_mutex_lock(&diskmtx);
869 TAILQ_FOREACH(dc, &diskconq, entry) {
870 if (strcmp(dc->disk, disk) == 0)
873 pthread_mutex_unlock(&diskmtx);
878 * Not already connected, create a connection to the kernel
881 asprintf(&path, "/dev/%s", disk);
882 fd = open(path, O_RDONLY);
884 fprintf(stderr, "reconnect %s: no access to disk\n", disk);
889 if (pipe(pipefds) < 0) {
890 fprintf(stderr, "reconnect %s: pipe() failed\n", disk);
894 bzero(&recls, sizeof(recls));
895 recls.fd = pipefds[0];
896 if (ioctl(fd, DIOCRECLUSTER, &recls) < 0) {
897 fprintf(stderr, "reconnect %s: ioctl failed\n", disk);
/* Record the connection so later calls find this disk in diskconq. */
906 dc = malloc(sizeof(*dc));
907 dc->disk = strdup(disk);
908 pthread_mutex_lock(&diskmtx);
909 TAILQ_INSERT_TAIL(&diskconq, dc, entry);
910 pthread_mutex_unlock(&diskmtx);
912 info = malloc(sizeof(*info));
913 bzero(info, sizeof(*info));
914 info->fd = pipefds[1];
916 info->usrmsg_callback = hammer2_usrmsg_handler;
917 info->exit_callback = disk_disconnect;
919 info->label = strdup(dc->disk);
920 pthread_create(&thread, NULL, dmsg_master_service, info);
/*
 * Exit callback installed by disk_reconnect().  `handle` is treated as the
 * struct diskcon of the connection; the disconnect is logged and the record
 * is unlinked from diskconq under diskmtx.
 *
 * NOTE(review): whatever follows the unlock (e.g. releasing the record) is
 * not visible in this view.
 */
925 disk_disconnect(void *handle)
927 struct diskcon *dc = handle;
929 fprintf(stderr, "DISK_DISCONNECT %s\n", dc->disk);
931 pthread_mutex_lock(&diskmtx);
932 TAILQ_REMOVE(&diskconq, dc, entry);
933 pthread_mutex_unlock(&diskmtx);
939 * Connect our cluster controller to /dev/xdisk. xdisk will pick up
940 * SPAN messages that we route to it, make remote block devices
941 * available to the host, and can issue dmsg transactions based on
/*
 * Body that attaches the cluster controller to /dev/xdisk.
 * NOTE(review): the signature line is not visible in this view; the log
 * messages below identify the function as xdisk_connect.
 *
 * Opens /dev/xdisk read/write and creates a pipe.  dmsg_master_service is
 * started in a new pthread on pipefds[1] with no exit callback, and
 * pipefds[0] is handed to the xdisk device with the XDISKIOCATTACH ioctl.
 * Failures are reported on stderr.
 */
948 dmsg_master_service_info_t *info;
949 struct xdisk_attach_ioctl xaioc;
956 * Is /dev/xdisk available?
958 xfd = open("/dev/xdisk", O_RDWR, 0600);
960 fprintf(stderr, "xdisk_connect: Unable to open /dev/xdisk\n");
964 if (pipe(pipefds) < 0) {
965 fprintf(stderr, "xdisk_connect: pipe() failed\n");
970 * Pipe between cluster controller (this user process).
972 info = malloc(sizeof(*info));
973 bzero(info, sizeof(*info));
974 info->fd = pipefds[1];
976 info->usrmsg_callback = hammer2_usrmsg_handler;
977 info->exit_callback = NULL;
978 pthread_create(&thread, NULL, dmsg_master_service, info);
981 * And the xdisk device.
983 bzero(&xaioc, sizeof(xaioc));
984 xaioc.fd = pipefds[0];
985 error = ioctl(xfd, XDISKIOCATTACH, &xaioc);
991 "xdisk_connect: cannot attach %s\n",