Merge branch 'vendor/TOP'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_synchro.c
1 /*
2  * Copyright (c) 2015-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51
52 typedef struct hammer2_deferred_ip {
53         struct hammer2_deferred_ip *next;
54         hammer2_inode_t *ip;
55 } hammer2_deferred_ip_t;
56
57 typedef struct hammer2_deferred_list {
58         hammer2_deferred_ip_t   *base;
59         int                     count;
60 } hammer2_deferred_list_t;
61
62
63 #define HAMMER2_SYNCHRO_DEBUG 1
64
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66                                 hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69                                 nerror = hammer2_sync_insert(
70                                                 thr, &parent, &chain,
71                                                 focus->bref.modify_tid,
72                                                 idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76                         hammer2_tid_t modify_tid, int idx,
77                         hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80                         hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82                         hammer2_chain_t *parent, hammer2_chain_t *chain,
83                         hammer2_tid_t mtid, int idx,
84                         hammer2_chain_t *focus, int isroot);
85
86 /****************************************************************************
87  *                          HAMMER2 SYNC THREADS                            *
88  ****************************************************************************/
89 /*
90  * Primary management thread for an element of a node.  A thread will exist
91  * for each element requiring management.
92  *
93  * No management threads are needed for the SPMP or for any PMP with only
94  * a single MASTER.
95  *
96  * On the SPMP - handles bulkfree and dedup operations
97  * On a PFS    - handles remastering and synchronization
98  */
99 void
100 hammer2_primary_sync_thread(void *arg)
101 {
102         hammer2_thread_t *thr = arg;
103         hammer2_pfs_t *pmp;
104         hammer2_deferred_list_t list;
105         hammer2_deferred_ip_t *defer;
106         int error;
107         uint32_t flags;
108         uint32_t nflags;
109
110         pmp = thr->pmp;
111         bzero(&list, sizeof(list));
112
113         for (;;) {
114                 flags = thr->flags;
115                 cpu_ccfence();
116
117                 /*
118                  * Handle stop request
119                  */
120                 if (flags & HAMMER2_THREAD_STOP)
121                         break;
122
123                 /*
124                  * Handle freeze request
125                  */
126                 if (flags & HAMMER2_THREAD_FREEZE) {
127                         nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
128                                             HAMMER2_THREAD_WAITING)) |
129                                  HAMMER2_THREAD_FROZEN;
130                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
131                                 continue;
132                         if (flags & HAMMER2_THREAD_WAITING)
133                                 wakeup(&thr->flags);
134                         continue;
135                 }
136
137                 if (flags & HAMMER2_THREAD_UNFREEZE) {
138                         nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
139                                            HAMMER2_THREAD_FROZEN |
140                                            HAMMER2_THREAD_WAITING);
141                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
142                                 continue;
143                         if (flags & HAMMER2_THREAD_WAITING)
144                                 wakeup(&thr->flags);
145                         continue;
146                 }
147
148                 /*
149                  * Force idle if frozen until unfrozen or stopped.
150                  */
151                 if (flags & HAMMER2_THREAD_FROZEN) {
152                         nflags = flags | HAMMER2_THREAD_WAITING;
153
154                         tsleep_interlock(&thr->flags, 0);
155                         if (atomic_cmpset_int(&thr->flags, flags, nflags))
156                                 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
157                         continue;
158                 }
159
160                 /*
161                  * Reset state on REMASTER request
162                  */
163                 if (thr->flags & HAMMER2_THREAD_REMASTER) {
164                         nflags = flags & ~HAMMER2_THREAD_REMASTER;
165                         if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
166                                 /* reset state here */
167                         }
168                         continue;
169                 }
170
171                 /*
172                  * Synchronization scan.
173                  */
174                 if (hammer2_debug & 0x8000)
175                         kprintf("sync_slaves pfs %s clindex %d\n",
176                                 pmp->pfs_names[thr->clindex], thr->clindex);
177                 hammer2_trans_init(pmp, 0);
178
179                 hammer2_inode_ref(pmp->iroot);
180
181                 for (;;) {
182                         int didbreak = 0;
183                         /* XXX lock synchronize pmp->modify_tid */
184                         error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
185                         if (hammer2_debug & 0x8000) {
186                                 kprintf("sync_slaves error %d defer %p\n",
187                                         error, list.base);
188                         }
189                         if (error != HAMMER2_ERROR_EAGAIN)
190                                 break;
191                         while ((defer = list.base) != NULL) {
192                                 hammer2_inode_t *nip;
193
194                                 nip = defer->ip;
195                                 error = hammer2_sync_slaves(thr, nip, &list,
196                                                         (nip == pmp->iroot));
197                                 if (error &&
198                                     error != HAMMER2_ERROR_EAGAIN &&
199                                     error != HAMMER2_ERROR_ENOENT) {
200                                         break;
201                                 }
202                                 if (hammer2_thr_break(thr)) {
203                                         didbreak = 1;
204                                         break;
205                                 }
206
207                                 /*
208                                  * If no additional defers occurred we can
209                                  * remove this one, otherwise keep it on
210                                  * the list and retry once the additional
211                                  * defers have completed.
212                                  */
213                                 if (defer == list.base) {
214                                         --list.count;
215                                         list.base = defer->next;
216                                         kfree(defer, M_HAMMER2);
217                                         defer = NULL;   /* safety */
218                                         hammer2_inode_drop(nip);
219                                 }
220                         }
221
222                         /*
223                          * If the thread is being remastered, frozen, or
224                          * stopped, clean up any left-over deferals.
225                          */
226                         if (didbreak ||
227                             (error && error != HAMMER2_ERROR_EAGAIN)) {
228                                 kprintf("didbreak\n");
229                                 while ((defer = list.base) != NULL) {
230                                         --list.count;
231                                         hammer2_inode_drop(defer->ip);
232                                         list.base = defer->next;
233                                         kfree(defer, M_HAMMER2);
234                                 }
235                                 if (error == 0 || error == HAMMER2_ERROR_EAGAIN)
236                                         error = HAMMER2_ERROR_EINPROGRESS;
237                                 break;
238                         }
239                 }
240
241                 hammer2_inode_drop(pmp->iroot);
242                 hammer2_trans_done(pmp);
243
244                 if (error && error != HAMMER2_ERROR_EINPROGRESS)
245                         kprintf("hammer2_sync_slaves: error %d\n", error);
246
247                 /*
248                  * Wait for event, or 5-second poll.
249                  */
250                 nflags = flags | HAMMER2_THREAD_WAITING;
251                 tsleep_interlock(&thr->flags, 0);
252                 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
253                         tsleep(&thr->flags, 0, "h2idle", hz * 5);
254                 }
255         }
256         thr->td = NULL;
257         hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
258         /* thr structure can go invalid after this point */
259 }
260
#if 0
/*
 * Given a locked cluster created from pmp->iroot, update the PFS's
 * reporting status.
 *
 * Only the HAMMER2_CLUSTER_ZFLAGS bits of 'flags' are considered.  When
 * they differ from the value cached in pmp->cluster_flags the cache is
 * updated and a one-line summary of the new state is printed.
 */
static void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp = thr->pmp;
	uint32_t status = flags & HAMMER2_CLUSTER_ZFLAGS;

	/* Nothing changed, nothing to report. */
	if (status == pmp->cluster_flags)
		return;
	pmp->cluster_flags = status;

	kprintf("pfs %p", pmp);

	/* Overall synchronization state. */
	if (status & HAMMER2_CLUSTER_MSYNCED) {
		kprintf(" masters-all-good");
	}
	if (status & HAMMER2_CLUSTER_SSYNCED) {
		kprintf(" slaves-all-good");
	}

	/* Hard (quorum) access, read-write taking precedence. */
	if (status & HAMMER2_CLUSTER_WRHARD) {
		kprintf(" quorum/rw");
	} else if (status & HAMMER2_CLUSTER_RDHARD) {
		kprintf(" quorum/ro");
	}

	/* Master visibility problems. */
	if (status & HAMMER2_CLUSTER_UNHARD) {
		kprintf(" out-of-sync-masters");
	} else if (status & HAMMER2_CLUSTER_NOHARD) {
		kprintf(" no-masters-visible");
	}

	/* Soft access, read-write taking precedence. */
	if (status & HAMMER2_CLUSTER_WRSOFT) {
		kprintf(" soft/rw");
	} else if (status & HAMMER2_CLUSTER_RDSOFT) {
		kprintf(" soft/ro");
	}

	/* Slave visibility problems. */
	if (status & HAMMER2_CLUSTER_UNSOFT) {
		kprintf(" out-of-sync-slaves");
	} else if (status & HAMMER2_CLUSTER_NOSOFT) {
		kprintf(" no-slaves-visible");
	}
	kprintf("\n");
}
#endif
305
#if 0
/*
 * Print one slot of a cluster: the chain's bref key followed by "(I)"
 * when the slot is flagged HAMMER2_CITEM_INVALID, or a NULL placeholder
 * when the slot has no chain.
 */
static
void
dumpcluster_item(hammer2_cluster_t *cl, int i)
{
	hammer2_chain_t *chain = cl->array[i].chain;

	if (chain == NULL) {
		kprintf("      NULL      %s ", "   ");
		return;
	}
	kprintf("%016jx%s ",
		chain->bref.key,
		((cl->array[i].flags &
		  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
	);
}

/*
 * Debugging aid: dump the slots of cparent and cluster side by side,
 * one line per cluster index, prefixed by label.  Does nothing unless
 * bit 0 of hammer2_debug is set.  Both clusters must have the same
 * number of chains.
 */
static
void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	int i;

	if ((hammer2_debug & 1) == 0)
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);
	for (i = 0; i < cparent->nchains; ++i) {
		/* Rows after the first are indented to line up under it. */
		if (i != 0)
			kprintf("\t");
		kprintf("%d ", i);
		dumpcluster_item(cparent, i);
		dumpcluster_item(cluster, i);
		kprintf("\n");
	}
}
#endif
346
347 /*
348  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
349  * the inode.  This creates a multiplication effect since the XOP scan itself
350  * issues to all nodes.  However, this is the only way we can safely
351  * synchronize nodes which might have disparate I/O bandwidths and the only
352  * way we can safely deal with stalled nodes.
353  *
354  * XXX serror / merror rollup and handling.
355  */
356 static
357 int
358 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
359                     hammer2_deferred_list_t *list, int isroot)
360 {
361         hammer2_xop_scanall_t *xop;
362         hammer2_chain_t *parent;
363         hammer2_chain_t *chain;
364         hammer2_pfs_t *pmp;
365         hammer2_key_t key_next;
366         hammer2_tid_t sync_tid;
367         int needrescan;
368         int want_update;
369         int serror;             /* slave error */
370         int merror;             /* master error (from xop_collect) */
371         int nerror;             /* temporary error */
372         int idx;
373         int n;
374
375         pmp = ip->pmp;
376         idx = thr->clindex;     /* cluster node we are responsible for */
377         needrescan = 0;
378         want_update = 0;
379         sync_tid = 0;
380         chain = NULL;
381         parent = NULL;
382
383 #if 0
384         /*
385          * Nothing to do if all slaves are synchronized.
386          * Nothing to do if cluster not authoritatively readable.
387          */
388         if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
389                 return(0);
390         if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
391                 return(HAMMER2_ERROR_INCOMPLETE);
392 #endif
393
394         merror = 0;
395
396         /*
397          * Resolve the root inode of the PFS and determine if synchronization
398          * is needed by checking modify_tid.
399          *
400          * Retain the synchronization TID from the focus inode and use it
401          * later to synchronize the focus inode if/when the recursion
402          * succeeds.
403          */
404         {
405                 hammer2_xop_ipcluster_t *xop2;
406                 hammer2_chain_t *focus;
407
408                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
409                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
410                 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
411                                          idx);
412                 hammer2_inode_unlock(ip);
413                 merror = hammer2_xop_collect(&xop2->head, 0);
414                 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
415                         sync_tid = focus->bref.modify_tid;
416                         chain = hammer2_inode_chain_and_parent(ip, idx,
417                                                     &parent,
418                                                     HAMMER2_RESOLVE_ALWAYS |
419                                                     HAMMER2_RESOLVE_SHARED);
420                         want_update = (chain->bref.modify_tid != sync_tid);
421                         if (chain) {
422                                 hammer2_chain_unlock(chain);
423                                 hammer2_chain_drop(chain);
424                                 chain = NULL;
425                         }
426                         if (parent) {
427                                 hammer2_chain_unlock(parent);
428                                 hammer2_chain_drop(parent);
429                                 parent = NULL;
430                         }
431                 }
432                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
433         }
434
435         if (want_update == 0)
436                 return(0);
437
438         /*
439          * The inode is left unlocked during the scan.  Issue a XOP
440          * that does *not* include our cluster index to iterate
441          * properly synchronized elements and resolve our cluster index
442          * against it.
443          */
444         hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
445         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
446         xop->key_beg = HAMMER2_KEY_MIN;
447         xop->key_end = HAMMER2_KEY_MAX;
448         xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
449                              HAMMER2_RESOLVE_ALWAYS;
450         xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
451                             HAMMER2_LOOKUP_NODIRECT |
452                             HAMMER2_LOOKUP_ALWAYS;
453         hammer2_xop_start_except(&xop->head, hammer2_xop_scanall, idx);
454         parent = hammer2_inode_chain(ip, idx,
455                                      HAMMER2_RESOLVE_ALWAYS |
456                                      HAMMER2_RESOLVE_SHARED);
457         hammer2_inode_unlock(ip);
458
459         chain = hammer2_chain_lookup(&parent, &key_next,
460                                      HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
461                                      &serror,
462                                      HAMMER2_LOOKUP_SHARED |
463                                      HAMMER2_LOOKUP_NODIRECT |
464                                      HAMMER2_LOOKUP_NODATA);
465         merror = hammer2_xop_collect(&xop->head, 0);
466         if (hammer2_debug & 0x8000) {
467                 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
468                         ip->meta.name_key, chain,
469                         (chain ? chain->bref.key : -1));
470         }
471
472         for (;;) {
473                 /*
474                  * We are done if our scan is done and the XOP scan is done.
475                  * We are done if the XOP scan failed (that is, we don't
476                  * have authoritative data to synchronize with).
477                  */
478                 int advance_local = 0;
479                 int advance_xop = 0;
480                 int dodefer = 0;
481                 hammer2_chain_t *focus;
482
483                 if (chain == NULL && merror == HAMMER2_ERROR_ENOENT)
484                         break;
485                 if (merror && merror != HAMMER2_ERROR_ENOENT)
486                         break;
487
488                 /*
489                  * Compare
490                  */
491                 if (chain && merror == HAMMER2_ERROR_ENOENT) {
492                         /*
493                          * If we have local chains but the XOP scan is done,
494                          * the chains need to be deleted.
495                          */
496                         n = -1;
497                         focus = NULL;
498                 } else if (chain == NULL) {
499                         /*
500                          * If our local scan is done but the XOP scan is not,
501                          * we need to create the missing chain(s).
502                          */
503                         n = 1;
504                         focus = xop->head.cluster.focus;
505                 } else {
506                         /*
507                          * Otherwise compare to determine the action
508                          * needed.
509                          */
510                         focus = xop->head.cluster.focus;
511                         n = hammer2_chain_cmp(chain, focus);
512                 }
513
514                 /*
515                  * Take action based on comparison results.
516                  */
517                 if (n < 0) {
518                         /*
519                          * Delete extranious local data.  This will
520                          * automatically advance the chain.
521                          */
522                         nerror = hammer2_sync_destroy(thr, &parent, &chain,
523                                                       0, idx);
524                 } else if (n == 0 && chain->bref.modify_tid !=
525                                      focus->bref.modify_tid) {
526                         /*
527                          * Matching key but local data or meta-data requires
528                          * updating.  If we will recurse, we still need to
529                          * update to compatible content first but we do not
530                          * synchronize modify_tid until the entire recursion
531                          * has completed successfully.
532                          */
533                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
534                                 nerror = hammer2_sync_replace(
535                                                 thr, parent, chain,
536                                                 0,
537                                                 idx, focus, 0);
538                                 dodefer = 1;
539                         } else {
540                                 nerror = hammer2_sync_replace(
541                                                 thr, parent, chain,
542                                                 focus->bref.modify_tid,
543                                                 idx, focus, 0);
544                         }
545                         advance_local = 1;
546                         advance_xop = 1;
547                 } else if (n == 0) {
548                         /*
549                          * 100% match, advance both
550                          */
551                         advance_local = 1;
552                         advance_xop = 1;
553                         nerror = 0;
554                 } else if (n > 0) {
555                         /*
556                          * Insert missing local data.
557                          *
558                          * If we will recurse, we still need to update to
559                          * compatible content first but we do not synchronize
560                          * modify_tid until the entire recursion has
561                          * completed successfully.
562                          */
563                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
564                                 nerror = hammer2_sync_insert(
565                                                 thr, &parent, &chain,
566                                                 0,
567                                                 idx, focus);
568                                 dodefer = 2;
569                         } else {
570                                 nerror = hammer2_sync_insert(
571                                                 thr, &parent, &chain,
572                                                 focus->bref.modify_tid,
573                                                 idx, focus);
574                         }
575                         advance_local = 1;
576                         advance_xop = 1;
577                 }
578
579                 /*
580                  * We cannot recurse depth-first because the XOP is still
581                  * running in node threads for this scan.  Create a placemarker
582                  * by obtaining and record the hammer2_inode.
583                  *
584                  * We excluded our node from the XOP so we must temporarily
585                  * add it to xop->head.cluster so it is properly incorporated
586                  * into the inode.
587                  *
588                  * The deferral is pushed onto a LIFO list for bottom-up
589                  * synchronization.
590                  */
591                 if (merror == 0 && dodefer) {
592                         hammer2_inode_t *nip;
593                         hammer2_deferred_ip_t *defer;
594
595                         KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
596
597                         defer = kmalloc(sizeof(*defer), M_HAMMER2,
598                                         M_WAITOK | M_ZERO);
599                         KKASSERT(xop->head.cluster.array[idx].chain == NULL);
600                         xop->head.cluster.array[idx].flags =
601                                                         HAMMER2_CITEM_INVALID;
602                         xop->head.cluster.array[idx].chain = chain;
603                         nip = hammer2_inode_get(pmp, ip,
604                                                 &xop->head.cluster, idx);
605                         xop->head.cluster.array[idx].chain = NULL;
606
607                         hammer2_inode_ref(nip);
608                         hammer2_inode_unlock(nip);
609
610                         defer->next = list->base;
611                         defer->ip = nip;
612                         list->base = defer;
613                         ++list->count;
614                         needrescan = 1;
615                 }
616
617                 /*
618                  * If at least one deferral was added and the deferral
619                  * list has grown too large, stop adding more.  This
620                  * will trigger an HAMMER2_ERROR_EAGAIN return.
621                  */
622                 if (needrescan && list->count > 1000)
623                         break;
624
625                 /*
626                  * Advancements for iteration.
627                  */
628                 if (advance_xop) {
629                         merror = hammer2_xop_collect(&xop->head, 0);
630                 }
631                 if (advance_local) {
632                         chain = hammer2_chain_next(&parent, chain, &key_next,
633                                                    key_next, HAMMER2_KEY_MAX,
634                                                    &serror,
635                                                    HAMMER2_LOOKUP_SHARED |
636                                                    HAMMER2_LOOKUP_NODIRECT |
637                                                    HAMMER2_LOOKUP_NODATA);
638                 }
639         }
640         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
641         if (chain) {
642                 hammer2_chain_unlock(chain);
643                 hammer2_chain_drop(chain);
644         }
645         if (parent) {
646                 hammer2_chain_unlock(parent);
647                 hammer2_chain_drop(parent);
648         }
649
650         /*
651          * If we added deferrals we want the caller to synchronize them
652          * and then call us again.
653          *
654          * NOTE: In this situation we do not yet want to synchronize our
655          *       inode, setting the error code also has that effect.
656          */
657         if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan)
658                 merror = HAMMER2_ERROR_EAGAIN;
659
660         /*
661          * If no error occurred we can synchronize the inode meta-data
662          * and modify_tid.  Only limited changes are made to PFSROOTs.
663          *
664          * XXX inode lock was lost
665          */
666         if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) {
667                 hammer2_xop_ipcluster_t *xop2;
668                 hammer2_chain_t *focus;
669
670                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
671                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
672                 hammer2_xop_start_except(&xop2->head, hammer2_xop_ipcluster,
673                                          idx);
674                 hammer2_inode_unlock(ip);
675                 merror = hammer2_xop_collect(&xop2->head, 0);
676                 if (merror == 0) {
677                         focus = xop2->head.cluster.focus;
678                         if (hammer2_debug & 0x8000) {
679                                 kprintf("syncthr: update inode %p (%s)\n",
680                                         focus,
681                                         (focus ? (char *)focus->data->
682                                                          ipdata.filename :
683                                                  "?"));
684                         }
685                         chain = hammer2_inode_chain_and_parent(ip, idx,
686                                                     &parent,
687                                                     HAMMER2_RESOLVE_ALWAYS |
688                                                     HAMMER2_RESOLVE_SHARED);
689
690                         KKASSERT(parent != NULL);
691                         nerror = hammer2_sync_replace(
692                                         thr, parent, chain,
693                                         sync_tid,
694                                         idx, focus, isroot);
695                         hammer2_chain_unlock(chain);
696                         hammer2_chain_drop(chain);
697                         hammer2_chain_unlock(parent);
698                         hammer2_chain_drop(parent);
699                         /* XXX */
700                 }
701                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
702         }
703
704         return merror;
705 }
706
707 /*
708  * Create a missing chain by copying the focus from another device.
709  *
710  * On entry *parentp and focus are both locked shared.  The chain will be
711  * created and returned in *chainp also locked shared.
712  */
713 static
714 int
715 hammer2_sync_insert(hammer2_thread_t *thr,
716                     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
717                     hammer2_tid_t mtid, int idx, hammer2_chain_t *focus)
718 {
719         hammer2_chain_t *chain;
720         hammer2_key_t dummy;
721         int error;
722
723 #if HAMMER2_SYNCHRO_DEBUG
724         if (hammer2_debug & 1)
725         kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
726                 *parentp, 
727                 (*parentp)->bref.type,
728                 (*parentp)->bref.key,
729                 idx,
730                 focus->bref.type, focus->bref.key, mtid);
731 #endif
732
733         /*
734          * Parent requires an exclusive lock for the insertion.
735          * We must unlock the child to avoid deadlocks while
736          * relocking the parent.
737          */
738         if (*chainp) {
739                 hammer2_chain_unlock(*chainp);
740                 hammer2_chain_drop(*chainp);
741                 *chainp = NULL;
742         }
743         hammer2_chain_unlock(*parentp);
744         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
745
746         /*
747          * We must reissue the lookup to properly position (*parentp)
748          * for the insertion.
749          */
750         chain = hammer2_chain_lookup(parentp, &dummy,
751                                      focus->bref.key, focus->bref.key,
752                                      &error,
753                                      HAMMER2_LOOKUP_NODIRECT |
754                                      HAMMER2_LOOKUP_ALWAYS);
755         KKASSERT(chain == NULL);
756
757         chain = NULL;
758         error = hammer2_chain_create(parentp, &chain,
759                                      thr->pmp, focus->bref.methods,
760                                      focus->bref.key, focus->bref.keybits,
761                                      focus->bref.type, focus->bytes,
762                                      mtid, 0, 0);
763         if (error == 0) {
764                 error = hammer2_chain_modify(chain, mtid, 0, 0);
765                 if (error)
766                         goto failed;
767
768                 /*
769                  * Copy focus to new chain
770                  */
771
772                 /* type already set */
773                 chain->bref.methods = focus->bref.methods;
774                 /* keybits already set */
775                 chain->bref.vradix = focus->bref.vradix;
776                 /* mirror_tid set by flush */
777                 KKASSERT(chain->bref.modify_tid == mtid);
778                 chain->bref.flags = focus->bref.flags;
779                 /* key already present */
780                 /* check code will be recalculated */
781
782                 /*
783                  * Copy data body.
784                  */
785                 switch(chain->bref.type) {
786                 case HAMMER2_BREF_TYPE_INODE:
787                         if ((focus->data->ipdata.meta.op_flags &
788                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
789                                 /* do not copy block table */
790                                 bcopy(focus->data, chain->data,
791                                       offsetof(hammer2_inode_data_t, u));
792                                 break;
793                         }
794                         /* fall through copy whole thing */
795                 case HAMMER2_BREF_TYPE_DATA:
796                         bcopy(focus->data, chain->data, chain->bytes);
797                         hammer2_chain_setcheck(chain, chain->data);
798                         break;
799                 case HAMMER2_BREF_TYPE_DIRENT:
800                         /*
801                          * Directory entries embed data in the blockref.
802                          */
803                         if (chain->bytes) {
804                                 bcopy(focus->data, chain->data, chain->bytes);
805                                 hammer2_chain_setcheck(chain, chain->data);
806                         } else {
807                                 chain->bref.check = focus->bref.check;
808                         }
809                         chain->bref.embed = focus->bref.embed;
810                         break;
811                 default:
812                         KKASSERT(0);
813                         break;
814                 }
815         }
816
817 failed:
818         if (chain)
819                 hammer2_chain_unlock(chain);    /* unlock, leave ref */
820         *chainp = chain;                        /* will be returned locked */
821
822         /*
823          * Avoid an ordering deadlock when relocking shared.
824          */
825         hammer2_chain_unlock(*parentp);
826         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
827                                      HAMMER2_RESOLVE_ALWAYS);
828         if (chain) {
829                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
830                                           HAMMER2_RESOLVE_ALWAYS);
831                 error = chain->error;
832         }
833
834         return error;
835 }
836
837 /*
838  * Destroy an extranious chain.
839  *
840  * Both *parentp and *chainp are locked shared.
841  *
842  * On return, *chainp will be adjusted to point to the next element in the
843  * iteration and locked shared.
844  */
845 static
846 int
847 hammer2_sync_destroy(hammer2_thread_t *thr,
848                      hammer2_chain_t **parentp, hammer2_chain_t **chainp,
849                      hammer2_tid_t mtid, int idx)
850 {
851         hammer2_chain_t *chain;
852         hammer2_key_t key_next;
853         hammer2_key_t save_key;
854         int error;
855
856         chain = *chainp;
857
858 #if HAMMER2_SYNCHRO_DEBUG
859         if (hammer2_debug & 1)
860         kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
861                 *parentp, chain,
862                 idx, chain->bref.type, chain->bref.key);
863 #endif
864
865         save_key = chain->bref.key;
866         if (save_key != HAMMER2_KEY_MAX)
867                 ++save_key;
868
869         /*
870          * Try to avoid unnecessary I/O.
871          *
872          * XXX accounting not propagated up properly.  We might have to do
873          *     a RESOLVE_MAYBE here and pass 0 for the flags.
874          */
875         hammer2_chain_unlock(chain);    /* relock exclusive */
876         hammer2_chain_unlock(*parentp);
877         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
878         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
879
880         hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
881         hammer2_chain_unlock(chain);
882         hammer2_chain_drop(chain);
883         chain = NULL;                   /* safety */
884
885         hammer2_chain_unlock(*parentp); /* relock shared */
886         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
887                                      HAMMER2_RESOLVE_ALWAYS);
888         *chainp = hammer2_chain_lookup(parentp, &key_next,
889                                      save_key, HAMMER2_KEY_MAX,
890                                      &error,
891                                      HAMMER2_LOOKUP_SHARED |
892                                      HAMMER2_LOOKUP_NODIRECT |
893                                      HAMMER2_LOOKUP_NODATA);
894         return error;
895 }
896
897 /*
898  * cparent is locked exclusively, with an extra ref, cluster is not locked.
899  * Replace element [i] in the cluster.
900  */
901 static
902 int
903 hammer2_sync_replace(hammer2_thread_t *thr,
904                      hammer2_chain_t *parent, hammer2_chain_t *chain,
905                      hammer2_tid_t mtid, int idx,
906                      hammer2_chain_t *focus, int isroot)
907 {
908         uint8_t otype;
909         int nradix;
910         int error;
911
912 #if HAMMER2_SYNCHRO_DEBUG
913         if (hammer2_debug & 1)
914         kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
915                 chain,
916                 idx,
917                 focus->bref.type, focus->bref.key, mtid);
918 #endif
919         hammer2_chain_unlock(chain);
920         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
921         error = chain->error;
922         if (error == 0) {
923                 if (chain->bytes != focus->bytes) {
924                         /* XXX what if compressed? */
925                         nradix = hammer2_getradix(chain->bytes);
926                         error = hammer2_chain_resize(chain, mtid, 0, nradix, 0);
927                         if (error)
928                                 goto failed;
929                 }
930                 error = hammer2_chain_modify(chain, mtid, 0, 0);
931                 if (error)
932                         goto failed;
933                 otype = chain->bref.type;
934                 chain->bref.type = focus->bref.type;
935                 chain->bref.methods = focus->bref.methods;
936                 chain->bref.keybits = focus->bref.keybits;
937                 chain->bref.vradix = focus->bref.vradix;
938                 /* mirror_tid updated by flush */
939                 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
940                 chain->bref.flags = focus->bref.flags;
941                 /* key already present */
942                 /* check code will be recalculated */
943
944                 /*
945                  * Copy data body.
946                  */
947                 switch(chain->bref.type) {
948                 case HAMMER2_BREF_TYPE_INODE:
949                         /*
950                          * Special case PFSROOTs, only limited changes can
951                          * be made since the meta-data contains miscellanious
952                          * distinguishing fields.
953                          */
954                         if (isroot) {
955                                 chain->data->ipdata.meta.uflags =
956                                         focus->data->ipdata.meta.uflags;
957                                 chain->data->ipdata.meta.rmajor =
958                                         focus->data->ipdata.meta.rmajor;
959                                 chain->data->ipdata.meta.rminor =
960                                         focus->data->ipdata.meta.rminor;
961                                 chain->data->ipdata.meta.ctime =
962                                         focus->data->ipdata.meta.ctime;
963                                 chain->data->ipdata.meta.mtime =
964                                         focus->data->ipdata.meta.mtime;
965                                 chain->data->ipdata.meta.atime =
966                                         focus->data->ipdata.meta.atime;
967                                 /* not btime */
968                                 chain->data->ipdata.meta.uid =
969                                         focus->data->ipdata.meta.uid;
970                                 chain->data->ipdata.meta.gid =
971                                         focus->data->ipdata.meta.gid;
972                                 chain->data->ipdata.meta.mode =
973                                         focus->data->ipdata.meta.mode;
974                                 chain->data->ipdata.meta.ncopies =
975                                         focus->data->ipdata.meta.ncopies;
976                                 chain->data->ipdata.meta.comp_algo =
977                                         focus->data->ipdata.meta.comp_algo;
978                                 chain->data->ipdata.meta.check_algo =
979                                         focus->data->ipdata.meta.check_algo;
980                                 chain->data->ipdata.meta.data_quota =
981                                         focus->data->ipdata.meta.data_quota;
982                                 chain->data->ipdata.meta.inode_quota =
983                                         focus->data->ipdata.meta.inode_quota;
984
985                                 /*
986                                  * last snapshot tid controls overwrite
987                                  */
988                                 if (chain->data->ipdata.meta.pfs_lsnap_tid <
989                                     focus->data->ipdata.meta.pfs_lsnap_tid) {
990                                         chain->data->ipdata.meta.pfs_lsnap_tid =
991                                         focus->data->ipdata.meta.pfs_lsnap_tid;
992                                 }
993
994                                 hammer2_chain_setcheck(chain, chain->data);
995                                 break;
996                         }
997
998                         /*
999                          * Normal replacement.
1000                          */
1001                         if ((focus->data->ipdata.meta.op_flags &
1002                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1003                                 /*
1004                                  * If DIRECTDATA is transitioning to 0 or the
1005                                  * old chain is not an inode we have to
1006                                  * initialize the block table.
1007                                  */
1008                                 if (otype != HAMMER2_BREF_TYPE_INODE ||
1009                                     (chain->data->ipdata.meta.op_flags &
1010                                      HAMMER2_OPFLAG_DIRECTDATA)) {
1011                                         kprintf("chain inode trans "
1012                                                 "away from dd\n");
1013                                         bzero(&chain->data->ipdata.u,
1014                                               sizeof(chain->data->ipdata.u));
1015                                 }
1016                                 bcopy(focus->data, chain->data,
1017                                       offsetof(hammer2_inode_data_t, u));
1018                                 /* XXX setcheck on inode should not be needed */
1019                                 hammer2_chain_setcheck(chain, chain->data);
1020                                 break;
1021                         }
1022                         /* fall through */
1023                 case HAMMER2_BREF_TYPE_DATA:
1024                         bcopy(focus->data, chain->data, chain->bytes);
1025                         hammer2_chain_setcheck(chain, chain->data);
1026                         break;
1027                 case HAMMER2_BREF_TYPE_DIRENT:
1028                         /*
1029                          * Directory entries embed data in the blockref.
1030                          */
1031                         if (chain->bytes) {
1032                                 bcopy(focus->data, chain->data, chain->bytes);
1033                                 hammer2_chain_setcheck(chain, chain->data);
1034                         } else {
1035                                 chain->bref.check = focus->bref.check;
1036                         }
1037                         chain->bref.embed = focus->bref.embed;
1038                         break;
1039                 default:
1040                         KKASSERT(0);
1041                         break;
1042                 }
1043         }
1044
1045 failed:
1046         hammer2_chain_unlock(chain);
1047         hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1048                                   HAMMER2_RESOLVE_MAYBE);
1049
1050         return error;
1051 }