hammer2 - Add peer_type field to LNK_CONN and LNK_SPAN
author Matthew Dillon <dillon@apollo.backplane.com>
Sun, 12 Aug 2012 02:32:03 +0000 (19:32 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Sun, 12 Aug 2012 02:32:03 +0000 (19:32 -0700)
* Adds a peer_type field allowing connections to identify what they are
  (i.e. a HAMMER2 mount, a CLUSTER controller, a BLOCK device controller).

  Rename the pfs_type field in the volume header, which never made much
  sense, to peer_type, which now does make sense though for the moment
  the only value it can have is HAMMER2_PEER_HAMMER2.

* Filter HAMMER2_PEER_HAMMER2 peer types by pfs_clid to reduce unnecessary
  LNK_SPAN traffic being transmitted to a HAMMER2 mount (in the kernel).

* Minor cleanup.

sbin/hammer2/cmd_service.c
sbin/hammer2/msg_lnk.c
sbin/hammer2/network.h
sbin/newfs_hammer2/newfs_hammer2.c
sys/vfs/hammer2/hammer2_disk.h
sys/vfs/hammer2/hammer2_network.h
sys/vfs/hammer2/hammer2_vfsops.c

index 2819b05..0fd3ea7 100644 (file)
@@ -200,6 +200,7 @@ master_reconnect(const char *mntpt)
        }
        if (pipe(pipefds) < 0) {
                fprintf(stderr, "reconnect %s: pipe() failed\n", mntpt);
+               close(fd);
                return;
        }
        bzero(&recls, sizeof(recls));
@@ -212,6 +213,7 @@ master_reconnect(const char *mntpt)
                return;
        }
        close(pipefds[0]);
+       close(fd);
 
        info = malloc(sizeof(*info));
        bzero(info, sizeof(*info));
@@ -272,12 +274,14 @@ master_auth_signal(hammer2_router_t *router)
         * Transmit LNK_CONN, enabling the SPAN protocol if both sides
         * agree.
         *
-        * XXX put additional authentication states here
+        * XXX put additional authentication states here?
         */
        msg = hammer2_msg_alloc(router, 0, HAMMER2_LNK_CONN |
-                                                  HAMMER2_MSGF_CREATE,
+                                          HAMMER2_MSGF_CREATE,
                                master_auth_conn_rx, NULL);
-       snprintf(msg->any.lnk_conn.label, sizeof(msg->any.lnk_conn.label), "*");
+       msg->any.lnk_conn.peer_mask = (uint64_t)-1;
+       msg->any.lnk_conn.peer_type = HAMMER2_PEER_CLUSTER;
+
        hammer2_msg_write(msg);
 
        hammer2_router_restate(router,
index cb5afb7..9e8d195 100644 (file)
  * outgoing LNK_SPAN transactions on each of our connections representing
  * the aggregated state.
  *
- * h2span_connect      - list of iocom connections who wish to receive SPAN
+ * h2span_conn         - list of iocom connections who wish to receive SPAN
  *                       propagation from other connections.  Might contain
  *                       a filter string.  Only iocom's with an open
  *                       LNK_CONN transactions are applicable for SPAN
 struct h2span_link;
 struct h2span_relay;
 TAILQ_HEAD(h2span_media_queue, h2span_media);
-TAILQ_HEAD(h2span_connect_queue, h2span_connect);
+TAILQ_HEAD(h2span_conn_queue, h2span_conn);
 TAILQ_HEAD(h2span_relay_queue, h2span_relay);
 
 RB_HEAD(h2span_cluster_tree, h2span_cluster);
@@ -224,8 +224,8 @@ typedef struct h2span_media_config h2span_media_config_t;
  * (may contain filter).  Typically one for each mount and several may
  * share the same media.
  */
-struct h2span_connect {
-       TAILQ_ENTRY(h2span_connect) entry;
+struct h2span_conn {
+       TAILQ_ENTRY(h2span_conn) entry;
        struct h2span_relay_tree tree;
        struct h2span_media *media;
        hammer2_state_t *state;
@@ -267,7 +267,7 @@ struct h2span_link {
  * In many respects this is the core of the protocol... actually figuring
  * out what LNK_SPANs to relay.  The spanid used for relaying is the
  * address of the 'state' structure, which is why h2span_relay has to
- * be entered into a RB-TREE based at h2span_connect (so we can look
+ * be entered into a RB-TREE based at h2span_conn (so we can look
  * up the spanid to validate it).
  *
  * NOTE: Messages can be received via the LNK_SPAN transaction the
@@ -279,9 +279,9 @@ struct h2span_link {
  *      transaction the relay is holding open.
  */
 struct h2span_relay {
-       RB_ENTRY(h2span_relay) rbnode;  /* from h2span_connect */
+       RB_ENTRY(h2span_relay) rbnode;  /* from h2span_conn */
        TAILQ_ENTRY(h2span_relay) entry; /* from link */
-       struct h2span_connect *conn;
+       struct h2span_conn *conn;
        hammer2_state_t *state;         /* transmitted LNK_SPAN */
        struct h2span_link *link;       /* LNK_SPAN being relayed */
        struct hammer2_router   *router;/* route out this relay */
@@ -289,7 +289,7 @@ struct h2span_relay {
 
 
 typedef struct h2span_media h2span_media_t;
-typedef struct h2span_connect h2span_connect_t;
+typedef struct h2span_conn h2span_conn_t;
 typedef struct h2span_cluster h2span_cluster_t;
 typedef struct h2span_node h2span_node_t;
 typedef struct h2span_link h2span_link_t;
@@ -396,13 +396,13 @@ RB_GENERATE_STATIC(h2span_relay_tree, h2span_relay,
  */
 static pthread_mutex_t cluster_mtx;
 static struct h2span_cluster_tree cluster_tree = RB_INITIALIZER(cluster_tree);
-static struct h2span_connect_queue connq = TAILQ_HEAD_INITIALIZER(connq);
+static struct h2span_conn_queue connq = TAILQ_HEAD_INITIALIZER(connq);
 static struct h2span_media_queue mediaq = TAILQ_HEAD_INITIALIZER(mediaq);
 
 static void hammer2_lnk_span(hammer2_msg_t *msg);
 static void hammer2_lnk_conn(hammer2_msg_t *msg);
 static void hammer2_lnk_relay(hammer2_msg_t *msg);
-static void hammer2_relay_scan(h2span_connect_t *conn, h2span_node_t *node);
+static void hammer2_relay_scan(h2span_conn_t *conn, h2span_node_t *node);
 static void hammer2_relay_delete(h2span_relay_t *relay);
 
 static void *hammer2_volconf_thread(void *info);
@@ -448,7 +448,7 @@ hammer2_lnk_conn(hammer2_msg_t *msg)
        hammer2_state_t *state = msg->state;
        h2span_media_t *media;
        h2span_media_config_t *conf;
-       h2span_connect_t *conn;
+       h2span_conn_t *conn;
        h2span_relay_t *relay;
        char *alloc = NULL;
        int i;
@@ -459,7 +459,7 @@ hammer2_lnk_conn(hammer2_msg_t *msg)
        case HAMMER2_LNK_CONN | HAMMER2_MSGF_CREATE:
        case HAMMER2_LNK_CONN | HAMMER2_MSGF_CREATE | HAMMER2_MSGF_DELETE:
                /*
-                * On transaction start we allocate a new h2span_connect and
+                * On transaction start we allocate a new h2span_conn and
                 * acknowledge the request, leaving the transaction open.
                 * We then relay priority-selected SPANs.
                 */
@@ -505,7 +505,7 @@ hammer2_lnk_conn(hammer2_msg_t *msg)
        case HAMMER2_LNK_ERROR | HAMMER2_MSGF_DELETE:
 deleteconn:
                /*
-                * On transaction terminate we clean out our h2span_connect
+                * On transaction terminate we clean out our h2span_conn
                 * and acknowledge the request, closing the transaction.
                 */
                fprintf(stderr, "LNK_CONN: Terminated\n");
@@ -805,10 +805,10 @@ hammer2_lnk_relay(hammer2_msg_t *msg)
  * Called with cluster_mtx held.
  */
 static void hammer2_relay_scan_specific(h2span_node_t *node,
-                                       h2span_connect_t *conn);
+                                       h2span_conn_t *conn);
 
 static void
-hammer2_relay_scan(h2span_connect_t *conn, h2span_node_t *node)
+hammer2_relay_scan(h2span_conn_t *conn, h2span_node_t *node)
 {
        h2span_cluster_t *cls;
 
@@ -884,13 +884,16 @@ hammer2_relay_scan_callback(h2span_relay_t *relay, void *arg)
 }
 
 static void
-hammer2_relay_scan_specific(h2span_node_t *node, h2span_connect_t *conn)
+hammer2_relay_scan_specific(h2span_node_t *node, h2span_conn_t *conn)
 {
        struct relay_scan_info info;
        h2span_relay_t *relay;
        h2span_relay_t *next_relay;
        h2span_link_t *slink;
+       hammer2_lnk_conn_t *lconn;
+       hammer2_msg_t *msg;
        int count = 2;
+       uint8_t peer_type;
 
        info.node = node;
        info.relay = NULL;
@@ -912,99 +915,118 @@ hammer2_relay_scan_specific(h2span_node_t *node, h2span_connect_t *conn)
        /*
         * Iterate the node's links (received SPANs) in distance order,
         * lowest (best) dist first.
+        *
+        * PROPAGATE THE BEST LINKS OVER THE SPECIFIED CONNECTION.
+        *
+        * Track relays while iterating the best links and construct
+        * missing relays when necessary.
+        *
+        * (If some prior better link was removed it would have also
+        *  removed the relay, so the relay can only match exactly or
+        *  be worse).
         */
-       /* fprintf(stderr, "LOOP\n"); */
        RB_FOREACH(slink, h2span_link_tree, &node->tree) {
                /*
-               fprintf(stderr, "SLINK %p RELAY %p(%p)\n",
-                       slink, relay, relay ? relay->link : NULL);
-               */
-               /*
-                * PROPAGATE THE BEST LINKS OVER THE SPECIFIED CONNECTION.
-                *
-                * Track relays while iterating the best links and construct
-                * missing relays when necessary.
-                *
-                * (If some prior better link was removed it would have also
-                *  removed the relay, so the relay can only match exactly or
-                *  be worse).
+                * Match, relay already in-place, get the next
+                * relay to match against the next slink.
                 */
                if (relay && relay->link == slink) {
-                       /*
-                        * Match, relay already in-place, get the next
-                        * relay to match against the next slink.
-                        */
                        relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
                        if (--count == 0)
                                break;
-               } else if (slink->dist > HAMMER2_SPAN_MAXDIST) {
-                       /*
-                        * No match but span distance is too great,
-                        * do not relay.  This prevents endless closed
-                        * loops with ever-incrementing distances when
-                        * the seed span is lost in the graph.
-                        *
-                        * All later spans will also be too far away so
-                        * we can break out of the loop.
-                        */
+                       continue;
+               }
+
+               /*
+                * We might want this SLINK, if it passes our filters.
+                *
+                * The spanning tree can cause closed loops so we have
+                * to limit slink->dist.
+                */
+               if (slink->dist > HAMMER2_SPAN_MAXDIST)
                        break;
-               } else if (slink->state->iocom == conn->state->iocom) {
-                       /*
-                        * No match but we would transmit a LNK_SPAN
-                        * out the same connection it came in on, which
-                        * can be trivially optimized out.
-                        */
+
+               /*
+                * Don't bother transmitting a LNK_SPAN out the same
+                * connection it came in on.  Trivial optimization.
+                */
+               if (slink->state->iocom == conn->state->iocom)
                        break;
-               } else {
-                       /*
-                        * No match, distance is ok, construct a new relay.
-                        * (slink is better than relay).
-                        */
-                       hammer2_msg_t *msg;
-
-                       assert(relay == NULL ||
-                              relay->link->node != slink->node ||
-                              relay->link->dist >= slink->dist);
-                       relay = hammer2_alloc(sizeof(*relay));
-                       relay->conn = conn;
-                       relay->link = slink;
-
-                       msg = hammer2_msg_alloc(conn->state->iocom->router, 0,
-                                               HAMMER2_LNK_SPAN |
-                                               HAMMER2_MSGF_CREATE,
-                                               hammer2_lnk_relay, relay);
-                       relay->state = msg->state;
-                       relay->router = hammer2_router_alloc();
-                       relay->router->iocom = relay->state->iocom;
-                       relay->router->relay = relay;
-                       relay->router->target = relay->state->msgid;
-
-                       msg->any.lnk_span = slink->state->msg->any.lnk_span;
-                       msg->any.lnk_span.dist = slink->dist + 1;
-
-                       hammer2_router_connect(relay->router);
-
-                       RB_INSERT(h2span_relay_tree, &conn->tree, relay);
-                       TAILQ_INSERT_TAIL(&slink->relayq, relay, entry);
-
-                       hammer2_msg_write(msg);
-
-                       fprintf(stderr,
-                               "RELAY SPAN %p RELAY %p ON CLS=%p NODE=%p DIST=%d "
-                               "FD %d state %p\n",
-                               slink,
-                               relay,
-                               node->cls, node, slink->dist,
-                               conn->state->iocom->sock_fd, relay->state);
 
-                       /*
-                        * Match (created new relay), get the next relay to
-                        * match against the next slink.
-                        */
-                       relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
-                       if (--count == 0)
+               /*
+                * NOTE ON FILTERS: The protocol spec allows non-requested
+                * SPANs to be transmitted, the other end is expected to
+                * leave their transactions open but otherwise ignore them.
+                *
+                * Don't bother transmitting if the remote connection
+                * is not accepting this SPAN's peer_type.
+                */
+               peer_type = slink->state->msg->any.lnk_span.peer_type;
+               lconn = &conn->state->msg->any.lnk_conn;
+               if (((1LLU << peer_type) & lconn->peer_mask) == 0)
+                       break;
+
+               /*
+                * Filter based on pfs_clid or label (XXX).  This typically
+                * reduces the amount of SPAN traffic that a mount end-point
+                * sees by only passing along SPANs related to the cluster id
+                * (that is, it will see all PFS's associated with the
+                * particular cluster it represents).
+                */
+               if (peer_type == lconn->peer_type &&
+                   peer_type == HAMMER2_PEER_HAMMER2) {
+                       if (!uuid_is_nil(&slink->node->cls->pfs_clid, NULL) &&
+                           uuid_compare(&slink->node->cls->pfs_clid,
+                                        &lconn->pfs_clid, NULL) != 0) {
                                break;
+                       }
                }
+
+               /*
+                * Ok, we've accepted this SPAN for relaying.
+                */
+               assert(relay == NULL ||
+                      relay->link->node != slink->node ||
+                      relay->link->dist >= slink->dist);
+               relay = hammer2_alloc(sizeof(*relay));
+               relay->conn = conn;
+               relay->link = slink;
+
+               msg = hammer2_msg_alloc(conn->state->iocom->router, 0,
+                                       HAMMER2_LNK_SPAN |
+                                       HAMMER2_MSGF_CREATE,
+                                       hammer2_lnk_relay, relay);
+               relay->state = msg->state;
+               relay->router = hammer2_router_alloc();
+               relay->router->iocom = relay->state->iocom;
+               relay->router->relay = relay;
+               relay->router->target = relay->state->msgid;
+
+               msg->any.lnk_span = slink->state->msg->any.lnk_span;
+               msg->any.lnk_span.dist = slink->dist + 1;
+
+               hammer2_router_connect(relay->router);
+
+               RB_INSERT(h2span_relay_tree, &conn->tree, relay);
+               TAILQ_INSERT_TAIL(&slink->relayq, relay, entry);
+
+               hammer2_msg_write(msg);
+
+               fprintf(stderr,
+                       "RELAY SPAN %p RELAY %p ON CLS=%p NODE=%p DIST=%d "
+                       "FD %d state %p\n",
+                       slink,
+                       relay,
+                       node->cls, node, slink->dist,
+                       conn->state->iocom->sock_fd, relay->state);
+
+               /*
+                * Match (created new relay), get the next relay to
+                * match against the next slink.
+                */
+               relay = RB_NEXT(h2span_relay_tree, &conn->tree, relay);
+               if (--count == 0)
+                       break;
        }
 
        /*
index 565d387..9ca48c4 100644 (file)
@@ -129,7 +129,7 @@ RB_HEAD(hammer2_router_tree, hammer2_router);
 
 struct h2span_link;
 struct h2span_relay;
-struct h2span_connect;
+struct h2span_conn;
 
 struct hammer2_state {
        RB_ENTRY(hammer2_state) rbnode;         /* indexed by msgid */
@@ -145,7 +145,7 @@ struct hammer2_state {
        union {
                void *any;
                struct h2span_link *link;
-               struct h2span_connect *conn;
+               struct h2span_conn *conn;
                struct h2span_relay *relay;
        } any;
 };
@@ -172,7 +172,7 @@ int hammer2_state_cmp(hammer2_state_t *state1, hammer2_state_t *state2);
 RB_PROTOTYPE(hammer2_state_tree, hammer2_state, rbnode, hammer2_state_cmp);
 
 /*
- * hammer2_ioq - An embedded component of hammer2_connect, holds state
+ * hammer2_ioq - An embedded component of hammer2_conn, holds state
  * for the buffering and parsing of incoming and outgoing messages.
  *
  * cdx - beg  - processed buffer data, encrypted or decrypted
index 66c757f..322d0b3 100644 (file)
@@ -622,6 +622,8 @@ format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
        vol->fsid = Hammer2_FSId;
        vol->fstype = Hammer2_FSType;
 
+       vol->peer_type = HAMMER2_PEER_HAMMER2;  /* LNK_CONN identification */
+
        vol->allocator_size = free_space;
        vol->allocator_free = free_space;
        vol->allocator_beg = alloc_base;
index 37f7c80..af13fd3 100644 (file)
@@ -507,6 +507,18 @@ typedef struct hammer2_inode_data hammer2_inode_data_t;
 #define HAMMER2_CHECK_NONE             0
 #define HAMMER2_CHECK_ICRC             1
 
+/*
+ * PEER types identify connections and help the cluster controller filter
+ * out unwanted SPANs.
+ */
+#define HAMMER2_PEER_NONE              0
+#define HAMMER2_PEER_CLUSTER           1       /* a cluster controller */
+#define HAMMER2_PEER_BLOCK             2       /* block devices */
+#define HAMMER2_PEER_HAMMER2           3       /* hammer2-mounted volumes */
+
+/*
+ * PFS types identify a PFS on media and in LNK_SPAN messages.
+ */
 #define HAMMER2_PFSTYPE_NONE           0
 #define HAMMER2_PFSTYPE_ADMIN          1
 #define HAMMER2_PFSTYPE_CLIENT         2
@@ -703,7 +715,7 @@ struct hammer2_volume_data {
        uint32_t        flags;                  /* 0034 */
        uint8_t         copyid;                 /* 0038 copyid of phys vol */
        uint8_t         freemap_version;        /* 0039 freemap algorithm */
-       uint8_t         pfs_type;               /* 003A local media pfstype */
+       uint8_t         peer_type;              /* 003A HAMMER2_PEER_xxx */
        uint8_t         reserved003B;           /* 003B */
        uint32_t        reserved003C;           /* 003C */
 
index 533e115..8eee05d 100644 (file)
@@ -373,18 +373,39 @@ struct hammer2_lnk_auth {
        char            dummy[64];
 };
 
+/*
+ * LNK_CONN identifies a streaming connection into the cluster.  The other
+ * fields serve as a filter when supported for a particular peer and are
+ * not necessarily all used.
+ *
+ * peer_mask serves to filter the SPANs we receive by peer.  A cluster
+ * controller typically sets this to (uint64_t)-1, a block devfs
+ * interface might set it to 1 << HAMMER2_PEER_BLOCK, and a hammer2
+ * mount might set it to 1 << HAMMER2_PEER_HAMMER2.
+ *
+ * mediaid allows multiple (e.g. HAMMER2) connections belonging to the same
+ * media to be grouped together for the purposes of LNK_VOLCONF updates.
+ *
+ * pfs_clid, pfs_fsid, pfs_type, and label are peer-specific and must be
+ * left empty (zero-fill) if not supported by a particular peer.
+ *
+ * HAMMER2_PEER_CLUSTER                filter: none
+ * HAMMER2_PEER_BLOCK          filter: label
+ * HAMMER2_PEER_HAMMER2                filter: pfs_clid if not empty, and label
+ */
 struct hammer2_lnk_conn {
        hammer2_msg_hdr_t head;
        uuid_t          mediaid;        /* media configuration id */
        uuid_t          pfs_clid;       /* rendezvous pfs uuid */
        uuid_t          pfs_fsid;       /* unique pfs uuid */
-       uint8_t         pfs_type;       /* peer type */
-       uint8_t         reserved01;
+       uint64_t        peer_mask;      /* PEER mask for SPAN filtering */
+       uint8_t         peer_type;      /* see HAMMER2_PEER_xxx */
+       uint8_t         pfs_type;       /* pfs type */
        uint16_t        proto_version;  /* high level protocol support */
        uint32_t        status;         /* status flags */
        uint8_t         reserved02[8];
        int32_t         dist;           /* span distance */
-       uint32_t        reserved03[15];
+       uint32_t        reserved03[14];
        char            label[256];     /* PFS label (can be wildcard) */
 };
 
@@ -436,8 +457,8 @@ struct hammer2_lnk_span {
        hammer2_msg_hdr_t head;
        uuid_t          pfs_clid;       /* rendezvous pfs uuid */
        uuid_t          pfs_fsid;       /* unique pfs uuid */
-       uint8_t         pfs_type;       /* peer type */
-       uint8_t         reserved01;
+       uint8_t         pfs_type;       /* PFS type */
+       uint8_t         peer_type;      /* PEER type */
        uint16_t        proto_version;  /* high level protocol support */
        uint32_t        status;         /* status flags */
        uint8_t         reserved02[8];
index 21c8652..7046890 100644 (file)
@@ -1241,6 +1241,8 @@ hammer2_cluster_thread_wr(void *arg)
        msg->any.lnk_conn.pfs_fsid = pmp->iroot->ip_data.pfs_fsid;
        msg->any.lnk_conn.pfs_type = pmp->iroot->ip_data.pfs_type;
        msg->any.lnk_conn.proto_version = HAMMER2_SPAN_PROTO_1;
+       msg->any.lnk_conn.peer_type = pmp->hmp->voldata.peer_type;
+       msg->any.lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
        name_len = pmp->iroot->ip_data.name_len;
        if (name_len >= sizeof(msg->any.lnk_conn.label))
                name_len = sizeof(msg->any.lnk_conn.label) - 1;
@@ -1537,6 +1539,7 @@ hammer2_msg_conn_reply(hammer2_state_t *state, hammer2_msg_t *msg)
                rmsg->any.lnk_span.pfs_clid = pmp->iroot->ip_data.pfs_clid;
                rmsg->any.lnk_span.pfs_fsid = pmp->iroot->ip_data.pfs_fsid;
                rmsg->any.lnk_span.pfs_type = pmp->iroot->ip_data.pfs_type;
+               rmsg->any.lnk_span.peer_type = pmp->hmp->voldata.peer_type;
                rmsg->any.lnk_span.proto_version = HAMMER2_SPAN_PROTO_1;
                name_len = pmp->iroot->ip_data.name_len;
                if (name_len >= sizeof(rmsg->any.lnk_span.label))