2b043d5d736e05d568e1c44187db1bc79355e823
[dragonfly.git] / sys / vfs / hammer2 / hammer2_synchro.c
1 /*
2  * Copyright (c) 2015-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51
52 typedef struct hammer2_deferred_ip {
53         struct hammer2_deferred_ip *next;
54         hammer2_inode_t *ip;
55 } hammer2_deferred_ip_t;
56
57 typedef struct hammer2_deferred_list {
58         hammer2_deferred_ip_t   *base;
59         int                     count;
60 } hammer2_deferred_list_t;
61
62
63 #define HAMMER2_SYNCHRO_DEBUG 1
64
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66                                 hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69                                 nerror = hammer2_sync_insert(
70                                                 thr, &parent, &chain,
71                                                 focus->bref.modify_tid,
72                                                 idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76                         hammer2_tid_t modify_tid, int idx,
77                         hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80                         hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82                         hammer2_chain_t *parent, hammer2_chain_t *chain,
83                         hammer2_tid_t mtid, int idx,
84                         hammer2_chain_t *focus, int isroot);
85
86 /****************************************************************************
87  *                          HAMMER2 SYNC THREADS                            *
88  ****************************************************************************/
89 /*
90  * Primary management thread for an element of a node.  A thread will exist
91  * for each element requiring management.
92  *
93  * No management threads are needed for the SPMP or for any PMP with only
94  * a single MASTER.
95  *
96  * On the SPMP - handles bulkfree and dedup operations
97  * On a PFS    - handles remastering and synchronization
98  */
99 void
100 hammer2_primary_sync_thread(void *arg)
101 {
102         hammer2_thread_t *thr = arg;
103         hammer2_pfs_t *pmp;
104         hammer2_deferred_list_t list;
105         hammer2_deferred_ip_t *defer;
106         int error;
107         uint32_t flags;
108         uint32_t nflags;
109
110         pmp = thr->pmp;
111         bzero(&list, sizeof(list));
112
113         for (;;) {
114                 flags = thr->flags;
115                 cpu_ccfence();
116
117                 /*
118                  * Handle stop request
119                  */
120                 if (flags & HAMMER2_THREAD_STOP)
121                         break;
122
123                 /*
124                  * Handle freeze request
125                  */
126                 if (flags & HAMMER2_THREAD_FREEZE) {
127                         nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
128                                             HAMMER2_THREAD_WAITING)) |
129                                  HAMMER2_THREAD_FROZEN;
130                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
131                                 continue;
132                         if (flags & HAMMER2_THREAD_WAITING)
133                                 wakeup(&thr->flags);
134                         continue;
135                 }
136
137                 if (flags & HAMMER2_THREAD_UNFREEZE) {
138                         nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
139                                            HAMMER2_THREAD_FROZEN |
140                                            HAMMER2_THREAD_WAITING);
141                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
142                                 continue;
143                         if (flags & HAMMER2_THREAD_WAITING)
144                                 wakeup(&thr->flags);
145                         continue;
146                 }
147
148                 /*
149                  * Force idle if frozen until unfrozen or stopped.
150                  */
151                 if (flags & HAMMER2_THREAD_FROZEN) {
152                         nflags = flags | HAMMER2_THREAD_WAITING;
153
154                         tsleep_interlock(&thr->flags, 0);
155                         if (atomic_cmpset_int(&thr->flags, flags, nflags))
156                                 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
157                         continue;
158                 }
159
160                 /*
161                  * Reset state on REMASTER request
162                  */
163                 if (thr->flags & HAMMER2_THREAD_REMASTER) {
164                         nflags = flags & ~HAMMER2_THREAD_REMASTER;
165                         if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
166                                 /* reset state here */
167                         }
168                         continue;
169                 }
170
171                 /*
172                  * Synchronization scan.
173                  */
174                 if (hammer2_debug & 0x8000)
175                         kprintf("sync_slaves pfs %s clindex %d\n",
176                                 pmp->pfs_names[thr->clindex], thr->clindex);
177                 hammer2_trans_init(pmp, 0);
178
179                 hammer2_inode_ref(pmp->iroot);
180
181                 for (;;) {
182                         int didbreak = 0;
183                         /* XXX lock synchronize pmp->modify_tid */
184                         error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
185                         if (hammer2_debug & 0x8000) {
186                                 kprintf("sync_slaves error %d defer %p\n",
187                                         error, list.base);
188                         }
189                         if (error != EAGAIN)
190                                 break;
191                         while ((defer = list.base) != NULL) {
192                                 hammer2_inode_t *nip;
193
194                                 nip = defer->ip;
195                                 error = hammer2_sync_slaves(thr, nip, &list,
196                                                         (nip == pmp->iroot));
197                                 if (error && error != EAGAIN && error != ENOENT)
198                                         break;
199                                 if (hammer2_thr_break(thr)) {
200                                         didbreak = 1;
201                                         break;
202                                 }
203
204                                 /*
205                                  * If no additional defers occurred we can
206                                  * remove this one, otherwise keep it on
207                                  * the list and retry once the additional
208                                  * defers have completed.
209                                  */
210                                 if (defer == list.base) {
211                                         --list.count;
212                                         list.base = defer->next;
213                                         kfree(defer, M_HAMMER2);
214                                         defer = NULL;   /* safety */
215                                         hammer2_inode_drop(nip);
216                                 }
217                         }
218
219                         /*
220                          * If the thread is being remastered, frozen, or
221                          * stopped, clean up any left-over deferals.
222                          */
223                         if (didbreak || (error && error != EAGAIN)) {
224                                 kprintf("didbreak\n");
225                                 while ((defer = list.base) != NULL) {
226                                         --list.count;
227                                         hammer2_inode_drop(defer->ip);
228                                         list.base = defer->next;
229                                         kfree(defer, M_HAMMER2);
230                                 }
231                                 if (error == 0 || error == EAGAIN)
232                                         error = EINPROGRESS;
233                                 break;
234                         }
235                 }
236
237                 hammer2_inode_drop(pmp->iroot);
238                 hammer2_trans_done(pmp);
239
240                 if (error && error != EINPROGRESS)
241                         kprintf("hammer2_sync_slaves: error %d\n", error);
242
243                 /*
244                  * Wait for event, or 5-second poll.
245                  */
246                 nflags = flags | HAMMER2_THREAD_WAITING;
247                 tsleep_interlock(&thr->flags, 0);
248                 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
249                         tsleep(&thr->flags, 0, "h2idle", hz * 5);
250                 }
251         }
252         thr->td = NULL;
253         hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
254         /* thr structure can go invalid after this point */
255 }
256
#if 0
/*
 * Record the cluster status bits for the thread's PFS and report them.
 *
 * 'flags' is masked with HAMMER2_CLUSTER_ZFLAGS.  If the masked value
 * equals pmp->cluster_flags nothing happens; otherwise it is stored and a
 * single summary line describing the new state is printed.
 */
static
void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
        hammer2_pfs_t *pmp = thr->pmp;
        uint32_t zflags = flags & HAMMER2_CLUSTER_ZFLAGS;

        /* Unchanged status: stay silent */
        if (pmp->cluster_flags == zflags)
                return;
        pmp->cluster_flags = zflags;

        kprintf("pfs %p", pmp);

        /* Overall synchronization state */
        if (zflags & HAMMER2_CLUSTER_MSYNCED) {
                kprintf(" masters-all-good");
        }
        if (zflags & HAMMER2_CLUSTER_SSYNCED) {
                kprintf(" slaves-all-good");
        }

        /* Hard (quorum) access; rw takes precedence over ro */
        if (zflags & HAMMER2_CLUSTER_WRHARD) {
                kprintf(" quorum/rw");
        } else if (zflags & HAMMER2_CLUSTER_RDHARD) {
                kprintf(" quorum/ro");
        }

        /* Master visibility problems */
        if (zflags & HAMMER2_CLUSTER_UNHARD) {
                kprintf(" out-of-sync-masters");
        } else if (zflags & HAMMER2_CLUSTER_NOHARD) {
                kprintf(" no-masters-visible");
        }

        /* Soft access; rw takes precedence over ro */
        if (zflags & HAMMER2_CLUSTER_WRSOFT) {
                kprintf(" soft/rw");
        } else if (zflags & HAMMER2_CLUSTER_RDSOFT) {
                kprintf(" soft/ro");
        }

        /* Slave visibility problems */
        if (zflags & HAMMER2_CLUSTER_UNSOFT) {
                kprintf(" out-of-sync-slaves");
        } else if (zflags & HAMMER2_CLUSTER_NOSOFT) {
                kprintf(" no-slaves-visible");
        }
        kprintf("\n");
}
#endif
301
#if 0
/*
 * Debug helper: print, for every slot index, the bref key of the chain
 * held by cparent and by cluster (or a NULL placeholder), tagging entries
 * whose HAMMER2_CITEM_INVALID flag is set with "(I)".
 *
 * Does nothing unless bit 0 of hammer2_debug is set.  Both clusters must
 * have the same nchains.
 */
static
void
dumpcluster(const char *label,
            hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
        hammer2_cluster_t *pair[2];
        hammer2_chain_t *chain;
        int slot;
        int side;

        if ((hammer2_debug & 1) == 0)
                return;

        /* Column order on each output line: cparent first, then cluster */
        pair[0] = cparent;
        pair[1] = cluster;

        kprintf("%s\t", label);
        KKASSERT(cparent->nchains == cluster->nchains);
        for (slot = 0; slot < cparent->nchains; ++slot) {
                /* Rows after the first are indented under the label */
                if (slot)
                        kprintf("\t");
                kprintf("%d ", slot);
                for (side = 0; side < 2; ++side) {
                        chain = pair[side]->array[slot].chain;
                        if (chain != NULL) {
                                kprintf("%016jx%s ",
                                        chain->bref.key,
                                        ((pair[side]->array[slot].flags &
                                          HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
                                );
                        } else {
                                kprintf("      NULL      %s ", "   ");
                        }
                }
                kprintf("\n");
        }
}
#endif
342
343 /*
344  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
345  * the inode.  This creates a multiplication effect since the XOP scan itself
346  * issues to all nodes.  However, this is the only way we can safely
347  * synchronize nodes which might have disparate I/O bandwidths and the only
348  * way we can safely deal with stalled nodes.
349  *
350  * XXX serror / merror rollup and handling.
351  */
352 static
353 int
354 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
355                     hammer2_deferred_list_t *list, int isroot)
356 {
357         hammer2_xop_scanall_t *xop;
358         hammer2_chain_t *parent;
359         hammer2_chain_t *chain;
360         hammer2_pfs_t *pmp;
361         hammer2_key_t key_next;
362         hammer2_tid_t sync_tid;
363         int needrescan;
364         int want_update;
365         int serror;             /* slave error */
366         int merror;             /* master error (from xop_collect) */
367         int nerror;             /* temporary error */
368         int idx;
369         int n;
370
371         pmp = ip->pmp;
372         idx = thr->clindex;     /* cluster node we are responsible for */
373         needrescan = 0;
374         want_update = 0;
375         sync_tid = 0;
376         chain = NULL;
377         parent = NULL;
378
379 #if 0
380         /*
381          * Nothing to do if all slaves are synchronized.
382          * Nothing to do if cluster not authoritatively readable.
383          */
384         if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
385                 return(0);
386         if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
387                 return(HAMMER2_ERROR_INCOMPLETE);
388 #endif
389
390         merror = 0;
391
392         /*
393          * Resolve the root inode of the PFS and determine if synchronization
394          * is needed by checking modify_tid.
395          *
396          * Retain the synchronization TID from the focus inode and use it
397          * later to synchronize the focus inode if/when the recursion
398          * succeeds.
399          */
400         {
401                 hammer2_xop_ipcluster_t *xop2;
402                 hammer2_chain_t *focus;
403
404                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
405                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
406                 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
407                                          idx);
408                 hammer2_inode_unlock(ip);
409                 merror = hammer2_xop_collect(&xop2->head, 0);
410                 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
411                         sync_tid = focus->bref.modify_tid;
412                         chain = hammer2_inode_chain_and_parent(ip, idx,
413                                                     &parent,
414                                                     HAMMER2_RESOLVE_ALWAYS |
415                                                     HAMMER2_RESOLVE_SHARED);
416                         want_update = (chain->bref.modify_tid != sync_tid);
417                         if (chain) {
418                                 hammer2_chain_unlock(chain);
419                                 hammer2_chain_drop(chain);
420                                 chain = NULL;
421                         }
422                         if (parent) {
423                                 hammer2_chain_unlock(parent);
424                                 hammer2_chain_drop(parent);
425                                 parent = NULL;
426                         }
427                 }
428                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
429         }
430
431         if (want_update == 0)
432                 return(0);
433
434         /*
435          * The inode is left unlocked during the scan.  Issue a XOP
436          * that does *not* include our cluster index to iterate
437          * properly synchronized elements and resolve our cluster index
438          * against it.
439          */
440         hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
441         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
442         xop->key_beg = HAMMER2_KEY_MIN;
443         xop->key_end = HAMMER2_KEY_MAX;
444         xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
445                              HAMMER2_RESOLVE_ALWAYS;
446         xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
447                             HAMMER2_LOOKUP_NODIRECT |
448                             HAMMER2_LOOKUP_ALWAYS;
449         hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
450         parent = hammer2_inode_chain(ip, idx,
451                                      HAMMER2_RESOLVE_ALWAYS |
452                                      HAMMER2_RESOLVE_SHARED);
453         hammer2_inode_unlock(ip);
454
455         chain = hammer2_chain_lookup(&parent, &key_next,
456                                      HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
457                                      &serror,
458                                      HAMMER2_LOOKUP_SHARED |
459                                      HAMMER2_LOOKUP_NODIRECT |
460                                      HAMMER2_LOOKUP_NODATA);
461         serror = hammer2_error_to_errno(serror);
462         merror = hammer2_xop_collect(&xop->head, 0);
463         if (hammer2_debug & 0x8000) {
464                 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
465                         ip->meta.name_key, chain,
466                         (chain ? chain->bref.key : -1));
467         }
468
469         for (;;) {
470                 /*
471                  * We are done if our scan is done and the XOP scan is done.
472                  * We are done if the XOP scan failed (that is, we don't
473                  * have authoritative data to synchronize with).
474                  */
475                 int advance_local = 0;
476                 int advance_xop = 0;
477                 int dodefer = 0;
478                 hammer2_chain_t *focus;
479
480                 if (chain == NULL && merror == ENOENT)
481                         break;
482                 if (merror && merror != ENOENT)
483                         break;
484
485                 /*
486                  * Compare
487                  */
488                 if (chain && merror == ENOENT) {
489                         /*
490                          * If we have local chains but the XOP scan is done,
491                          * the chains need to be deleted.
492                          */
493                         n = -1;
494                         focus = NULL;
495                 } else if (chain == NULL) {
496                         /*
497                          * If our local scan is done but the XOP scan is not,
498                          * we need to create the missing chain(s).
499                          */
500                         n = 1;
501                         focus = xop->head.cluster.focus;
502                 } else {
503                         /*
504                          * Otherwise compare to determine the action
505                          * needed.
506                          */
507                         focus = xop->head.cluster.focus;
508                         n = hammer2_chain_cmp(chain, focus);
509                 }
510
511                 /*
512                  * Take action based on comparison results.
513                  */
514                 if (n < 0) {
515                         /*
516                          * Delete extranious local data.  This will
517                          * automatically advance the chain.
518                          */
519                         nerror = hammer2_sync_destroy(thr, &parent, &chain,
520                                                       0, idx);
521                 } else if (n == 0 && chain->bref.modify_tid !=
522                                      focus->bref.modify_tid) {
523                         /*
524                          * Matching key but local data or meta-data requires
525                          * updating.  If we will recurse, we still need to
526                          * update to compatible content first but we do not
527                          * synchronize modify_tid until the entire recursion
528                          * has completed successfully.
529                          */
530                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
531                                 nerror = hammer2_sync_replace(
532                                                 thr, parent, chain,
533                                                 0,
534                                                 idx, focus, 0);
535                                 dodefer = 1;
536                         } else {
537                                 nerror = hammer2_sync_replace(
538                                                 thr, parent, chain,
539                                                 focus->bref.modify_tid,
540                                                 idx, focus, 0);
541                         }
542                         advance_local = 1;
543                         advance_xop = 1;
544                 } else if (n == 0) {
545                         /*
546                          * 100% match, advance both
547                          */
548                         advance_local = 1;
549                         advance_xop = 1;
550                         nerror = 0;
551                 } else if (n > 0) {
552                         /*
553                          * Insert missing local data.
554                          *
555                          * If we will recurse, we still need to update to
556                          * compatible content first but we do not synchronize
557                          * modify_tid until the entire recursion has
558                          * completed successfully.
559                          */
560                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
561                                 nerror = hammer2_sync_insert(
562                                                 thr, &parent, &chain,
563                                                 0,
564                                                 idx, focus);
565                                 dodefer = 2;
566                         } else {
567                                 nerror = hammer2_sync_insert(
568                                                 thr, &parent, &chain,
569                                                 focus->bref.modify_tid,
570                                                 idx, focus);
571                         }
572                         advance_local = 1;
573                         advance_xop = 1;
574                 }
575
576                 /*
577                  * We cannot recurse depth-first because the XOP is still
578                  * running in node threads for this scan.  Create a placemarker
579                  * by obtaining and record the hammer2_inode.
580                  *
581                  * We excluded our node from the XOP so we must temporarily
582                  * add it to xop->head.cluster so it is properly incorporated
583                  * into the inode.
584                  *
585                  * The deferral is pushed onto a LIFO list for bottom-up
586                  * synchronization.
587                  */
588                 if (merror == 0 && dodefer) {
589                         hammer2_inode_t *nip;
590                         hammer2_deferred_ip_t *defer;
591
592                         KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
593
594                         defer = kmalloc(sizeof(*defer), M_HAMMER2,
595                                         M_WAITOK | M_ZERO);
596                         KKASSERT(xop->head.cluster.array[idx].chain == NULL);
597                         xop->head.cluster.array[idx].flags =
598                                                         HAMMER2_CITEM_INVALID;
599                         xop->head.cluster.array[idx].chain = chain;
600                         nip = hammer2_inode_get(pmp, ip,
601                                                 &xop->head.cluster, idx);
602                         xop->head.cluster.array[idx].chain = NULL;
603
604                         hammer2_inode_ref(nip);
605                         hammer2_inode_unlock(nip);
606
607                         defer->next = list->base;
608                         defer->ip = nip;
609                         list->base = defer;
610                         ++list->count;
611                         needrescan = 1;
612                 }
613
614                 /*
615                  * If at least one deferral was added and the deferral
616                  * list has grown too large, stop adding more.  This
617                  * will trigger an EAGAIN return.
618                  */
619                 if (needrescan && list->count > 1000)
620                         break;
621
622                 /*
623                  * Advancements for iteration.
624                  */
625                 if (advance_xop) {
626                         merror = hammer2_xop_collect(&xop->head, 0);
627                 }
628                 if (advance_local) {
629                         chain = hammer2_chain_next(&parent, chain, &key_next,
630                                                    key_next, HAMMER2_KEY_MAX,
631                                                    &serror,
632                                                    HAMMER2_LOOKUP_SHARED |
633                                                    HAMMER2_LOOKUP_NODIRECT |
634                                                    HAMMER2_LOOKUP_NODATA);
635                         serror = hammer2_error_to_errno(serror);
636                 }
637         }
638         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
639         if (chain) {
640                 hammer2_chain_unlock(chain);
641                 hammer2_chain_drop(chain);
642         }
643         if (parent) {
644                 hammer2_chain_unlock(parent);
645                 hammer2_chain_drop(parent);
646         }
647
648         /*
649          * If we added deferrals we want the caller to synchronize them
650          * and then call us again.
651          *
652          * NOTE: In this situation we do not yet want to synchronize our
653          *       inode, setting the error code also has that effect.
654          */
655         if ((merror == 0 || merror == ENOENT) && needrescan)
656                 merror = EAGAIN;
657
658         /*
659          * If no error occurred we can synchronize the inode meta-data
660          * and modify_tid.  Only limited changes are made to PFSROOTs.
661          *
662          * XXX inode lock was lost
663          */
664         if (merror == 0 || merror == ENOENT) {
665                 hammer2_xop_ipcluster_t *xop2;
666                 hammer2_chain_t *focus;
667
668                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
669                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
670                 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
671                                          idx);
672                 hammer2_inode_unlock(ip);
673                 merror = hammer2_xop_collect(&xop2->head, 0);
674                 if (merror == 0) {
675                         focus = xop2->head.cluster.focus;
676                         if (hammer2_debug & 0x8000) {
677                                 kprintf("syncthr: update inode %p (%s)\n",
678                                         focus,
679                                         (focus ? (char *)focus->data->
680                                                          ipdata.filename :
681                                                  "?"));
682                         }
683                         chain = hammer2_inode_chain_and_parent(ip, idx,
684                                                     &parent,
685                                                     HAMMER2_RESOLVE_ALWAYS |
686                                                     HAMMER2_RESOLVE_SHARED);
687
688                         KKASSERT(parent != NULL);
689                         nerror = hammer2_sync_replace(
690                                         thr, parent, chain,
691                                         sync_tid,
692                                         idx, focus, isroot);
693                         hammer2_chain_unlock(chain);
694                         hammer2_chain_drop(chain);
695                         hammer2_chain_unlock(parent);
696                         hammer2_chain_drop(parent);
697                         /* XXX */
698                 }
699                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
700         }
701
702         return merror;
703 }
704
705 /*
706  * Create a missing chain by copying the focus from another device.
707  *
708  * On entry *parentp and focus are both locked shared.  The chain will be
709  * created and returned in *chainp also locked shared.
710  */
711 static
712 int
713 hammer2_sync_insert(hammer2_thread_t *thr,
714                     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
715                     hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
716 {
717         hammer2_chain_t *chain;
718         hammer2_key_t dummy;
719         int error;
720
721 #if HAMMER2_SYNCHRO_DEBUG
722         if (hammer2_debug & 1)
723         kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
724                 *parentp, 
725                 (*parentp)->bref.type,
726                 (*parentp)->bref.key,
727                 idx,
728                 focus->bref.type, focus->bref.key, mtid);
729 #endif
730
731         /*
732          * Parent requires an exclusive lock for the insertion.
733          * We must unlock the child to avoid deadlocks while
734          * relocking the parent.
735          */
736         if (*chainp) {
737                 hammer2_chain_unlock(*chainp);
738                 hammer2_chain_drop(*chainp);
739                 *chainp = NULL;
740         }
741         hammer2_chain_unlock(*parentp);
742         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
743
744         /*
745          * We must reissue the lookup to properly position (*parentp)
746          * for the insertion.
747          */
748         chain = hammer2_chain_lookup(parentp, &dummy,
749                                      focus->bref.key, focus->bref.key,
750                                      &error,
751                                      HAMMER2_LOOKUP_NODIRECT |
752                                      HAMMER2_LOOKUP_ALWAYS);
753         KKASSERT(chain == NULL);
754
755         chain = NULL;
756         error = hammer2_chain_create(parentp, &chain,
757                                      thr->pmp, focus->bref.methods,
758                                      focus->bref.key, focus->bref.keybits,
759                                      focus->bref.type, focus->bytes,
760                                      mtid, 0, 0);
761         if (error == 0) {
762                 hammer2_chain_modify(chain, mtid, 0, 0);
763
764                 /*
765                  * Copy focus to new chain
766                  */
767
768                 /* type already set */
769                 chain->bref.methods = focus->bref.methods;
770                 /* keybits already set */
771                 chain->bref.vradix = focus->bref.vradix;
772                 /* mirror_tid set by flush */
773                 KKASSERT(chain->bref.modify_tid == mtid);
774                 chain->bref.flags = focus->bref.flags;
775                 /* key already present */
776                 /* check code will be recalculated */
777
778                 /*
779                  * Copy data body.
780                  */
781                 switch(chain->bref.type) {
782                 case HAMMER2_BREF_TYPE_INODE:
783                         if ((focus->data->ipdata.meta.op_flags &
784                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
785                                 /* do not copy block table */
786                                 bcopy(focus->data, chain->data,
787                                       offsetof(hammer2_inode_data_t, u));
788                                 break;
789                         }
790                         /* fall through copy whole thing */
791                 case HAMMER2_BREF_TYPE_DATA:
792                         bcopy(focus->data, chain->data, chain->bytes);
793                         hammer2_chain_setcheck(chain, chain->data);
794                         break;
795                 case HAMMER2_BREF_TYPE_DIRENT:
796                         /*
797                          * Directory entries embed data in the blockref.
798                          */
799                         if (chain->bytes) {
800                                 bcopy(focus->data, chain->data, chain->bytes);
801                                 hammer2_chain_setcheck(chain, chain->data);
802                         } else {
803                                 chain->bref.check = focus->bref.check;
804                         }
805                         chain->bref.embed = focus->bref.embed;
806                         break;
807                 default:
808                         KKASSERT(0);
809                         break;
810                 }
811         }
812
813         if (chain)
814                 hammer2_chain_unlock(chain);    /* unlock, leave ref */
815         *chainp = chain;                        /* will be returned locked */
816
817         /*
818          * Avoid an ordering deadlock when relocking shared.
819          */
820         hammer2_chain_unlock(*parentp);
821         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
822                                      HAMMER2_RESOLVE_ALWAYS);
823         if (chain) {
824                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
825                                           HAMMER2_RESOLVE_ALWAYS);
826                 error = chain->error;
827         }
828
829         return error;
830 }
831
832 /*
833  * Destroy an extranious chain.
834  *
835  * Both *parentp and *chainp are locked shared.
836  *
837  * On return, *chainp will be adjusted to point to the next element in the
838  * iteration and locked shared.
839  */
840 static
841 int
842 hammer2_sync_destroy(hammer2_thread_t *thr,
843                      hammer2_chain_t **parentp, hammer2_chain_t **chainp,
844                      hammer2_tid_t mtid, int idx)
845 {
846         hammer2_chain_t *chain;
847         hammer2_key_t key_next;
848         hammer2_key_t save_key;
849         int error;
850
851         chain = *chainp;
852
853 #if HAMMER2_SYNCHRO_DEBUG
854         if (hammer2_debug & 1)
855         kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
856                 *parentp, chain,
857                 idx, chain->bref.type, chain->bref.key);
858 #endif
859
860         save_key = chain->bref.key;
861         if (save_key != HAMMER2_KEY_MAX)
862                 ++save_key;
863
864         /*
865          * Try to avoid unnecessary I/O.
866          *
867          * XXX accounting not propagated up properly.  We might have to do
868          *     a RESOLVE_MAYBE here and pass 0 for the flags.
869          */
870         hammer2_chain_unlock(chain);    /* relock exclusive */
871         hammer2_chain_unlock(*parentp);
872         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
873         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
874
875         hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
876         hammer2_chain_unlock(chain);
877         hammer2_chain_drop(chain);
878         chain = NULL;                   /* safety */
879
880         hammer2_chain_unlock(*parentp); /* relock shared */
881         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
882                                      HAMMER2_RESOLVE_ALWAYS);
883         *chainp = hammer2_chain_lookup(parentp, &key_next,
884                                      save_key, HAMMER2_KEY_MAX,
885                                      &error,
886                                      HAMMER2_LOOKUP_SHARED |
887                                      HAMMER2_LOOKUP_NODIRECT |
888                                      HAMMER2_LOOKUP_NODATA);
889         return error;
890 }
891
892 /*
893  * cparent is locked exclusively, with an extra ref, cluster is not locked.
894  * Replace element [i] in the cluster.
895  */
896 static
897 int
898 hammer2_sync_replace(hammer2_thread_t *thr,
899                      hammer2_chain_t *parent, hammer2_chain_t *chain,
900                      hammer2_tid_t mtid, int idx,
901                      hammer2_chain_t *focus, int isroot)
902 {
903         uint8_t otype;
904         int nradix;
905
906 #if HAMMER2_SYNCHRO_DEBUG
907         if (hammer2_debug & 1)
908         kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
909                 chain,
910                 idx,
911                 focus->bref.type, focus->bref.key, mtid);
912 #endif
913         hammer2_chain_unlock(chain);
914         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
915         if (chain->error == 0) {
916                 if (chain->bytes != focus->bytes) {
917                         /* XXX what if compressed? */
918                         nradix = hammer2_getradix(chain->bytes);
919                         hammer2_chain_resize(chain, mtid, 0, nradix, 0);
920                 }
921                 hammer2_chain_modify(chain, mtid, 0, 0);
922                 otype = chain->bref.type;
923                 chain->bref.type = focus->bref.type;
924                 chain->bref.methods = focus->bref.methods;
925                 chain->bref.keybits = focus->bref.keybits;
926                 chain->bref.vradix = focus->bref.vradix;
927                 /* mirror_tid updated by flush */
928                 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
929                 chain->bref.flags = focus->bref.flags;
930                 /* key already present */
931                 /* check code will be recalculated */
932
933                 /*
934                  * Copy data body.
935                  */
936                 switch(chain->bref.type) {
937                 case HAMMER2_BREF_TYPE_INODE:
938                         /*
939                          * Special case PFSROOTs, only limited changes can
940                          * be made since the meta-data contains miscellanious
941                          * distinguishing fields.
942                          */
943                         if (isroot) {
944                                 chain->data->ipdata.meta.uflags =
945                                         focus->data->ipdata.meta.uflags;
946                                 chain->data->ipdata.meta.rmajor =
947                                         focus->data->ipdata.meta.rmajor;
948                                 chain->data->ipdata.meta.rminor =
949                                         focus->data->ipdata.meta.rminor;
950                                 chain->data->ipdata.meta.ctime =
951                                         focus->data->ipdata.meta.ctime;
952                                 chain->data->ipdata.meta.mtime =
953                                         focus->data->ipdata.meta.mtime;
954                                 chain->data->ipdata.meta.atime =
955                                         focus->data->ipdata.meta.atime;
956                                 /* not btime */
957                                 chain->data->ipdata.meta.uid =
958                                         focus->data->ipdata.meta.uid;
959                                 chain->data->ipdata.meta.gid =
960                                         focus->data->ipdata.meta.gid;
961                                 chain->data->ipdata.meta.mode =
962                                         focus->data->ipdata.meta.mode;
963                                 chain->data->ipdata.meta.ncopies =
964                                         focus->data->ipdata.meta.ncopies;
965                                 chain->data->ipdata.meta.comp_algo =
966                                         focus->data->ipdata.meta.comp_algo;
967                                 chain->data->ipdata.meta.check_algo =
968                                         focus->data->ipdata.meta.check_algo;
969                                 chain->data->ipdata.meta.data_quota =
970                                         focus->data->ipdata.meta.data_quota;
971                                 chain->data->ipdata.meta.inode_quota =
972                                         focus->data->ipdata.meta.inode_quota;
973
974                                 /*
975                                  * last snapshot tid controls overwrite
976                                  */
977                                 if (chain->data->ipdata.meta.pfs_lsnap_tid <
978                                     focus->data->ipdata.meta.pfs_lsnap_tid) {
979                                         chain->data->ipdata.meta.pfs_lsnap_tid =
980                                         focus->data->ipdata.meta.pfs_lsnap_tid;
981                                 }
982
983                                 hammer2_chain_setcheck(chain, chain->data);
984                                 break;
985                         }
986
987                         /*
988                          * Normal replacement.
989                          */
990                         if ((focus->data->ipdata.meta.op_flags &
991                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
992                                 /*
993                                  * If DIRECTDATA is transitioning to 0 or the
994                                  * old chain is not an inode we have to
995                                  * initialize the block table.
996                                  */
997                                 if (otype != HAMMER2_BREF_TYPE_INODE ||
998                                     (chain->data->ipdata.meta.op_flags &
999                                      HAMMER2_OPFLAG_DIRECTDATA)) {
1000                                         kprintf("chain inode trans "
1001                                                 "away from dd\n");
1002                                         bzero(&chain->data->ipdata.u,
1003                                               sizeof(chain->data->ipdata.u));
1004                                 }
1005                                 bcopy(focus->data, chain->data,
1006                                       offsetof(hammer2_inode_data_t, u));
1007                                 /* XXX setcheck on inode should not be needed */
1008                                 hammer2_chain_setcheck(chain, chain->data);
1009                                 break;
1010                         }
1011                         /* fall through */
1012                 case HAMMER2_BREF_TYPE_DATA:
1013                         bcopy(focus->data, chain->data, chain->bytes);
1014                         hammer2_chain_setcheck(chain, chain->data);
1015                         break;
1016                 case HAMMER2_BREF_TYPE_DIRENT:
1017                         /*
1018                          * Directory entries embed data in the blockref.
1019                          */
1020                         if (chain->bytes) {
1021                                 bcopy(focus->data, chain->data, chain->bytes);
1022                                 hammer2_chain_setcheck(chain, chain->data);
1023                         } else {
1024                                 chain->bref.check = focus->bref.check;
1025                         }
1026                         chain->bref.embed = focus->bref.embed;
1027                         break;
1028                 default:
1029                         KKASSERT(0);
1030                         break;
1031                 }
1032         }
1033
1034         hammer2_chain_unlock(chain);
1035         hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1036                                   HAMMER2_RESOLVE_MAYBE);
1037
1038         return 0;
1039 }