hammer2 - refactor filesystem sync 1/N
[dragonfly.git] / sys / vfs / hammer2 / hammer2_synchro.c
1 /*
2  * Copyright (c) 2015-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * This module implements the cluster synchronizer.  Basically the way
36  * it works is that a thread is created for each cluster node in a PFS.
37  * This thread is responsible for synchronizing the current node using
38  * data from other nodes.
39  *
40  * Any out of sync master or slave can get back into synchronization as
41  * long as a quorum of masters agree on the update_tid.  If a quorum is
42  * not available it may still be possible to synchronize to the highest
43  * available update_tid as a way of trying to catch up as much as possible
44  * until a quorum is available.
45  *
46  * If no quorum is possible (which can happen even if all masters are
47  * available, if the update_tid does not match), then manual intervention
48  * may be required to resolve discrepancies.
49  */
50 #include "hammer2.h"
51
52 typedef struct hammer2_deferred_ip {
53         struct hammer2_deferred_ip *next;
54         hammer2_inode_t *ip;
55 } hammer2_deferred_ip_t;
56
57 typedef struct hammer2_deferred_list {
58         hammer2_deferred_ip_t   *base;
59         int                     count;
60 } hammer2_deferred_list_t;
61
62
63 #define HAMMER2_SYNCHRO_DEBUG 1
64
65 static int hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
66                                 hammer2_deferred_list_t *list, int isroot);
67 #if 0
68 static void hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags);
69                                 nerror = hammer2_sync_insert(
70                                                 thr, &parent, &chain,
71                                                 focus->bref.modify_tid,
72                                                 idx, focus);
73 #endif
74 static int hammer2_sync_insert(hammer2_thread_t *thr,
75                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
76                         hammer2_tid_t modify_tid, int idx,
77                         hammer2_xop_head_t *xop, hammer2_chain_t *focus);
78 static int hammer2_sync_destroy(hammer2_thread_t *thr,
79                         hammer2_chain_t **parentp, hammer2_chain_t **chainp,
80                         hammer2_tid_t mtid, int idx);
81 static int hammer2_sync_replace(hammer2_thread_t *thr,
82                         hammer2_chain_t *parent, hammer2_chain_t *chain,
83                         hammer2_tid_t mtid, int idx,
84                         hammer2_xop_head_t *xop, hammer2_chain_t *focus,
85                         int isroot);
86
87 /****************************************************************************
88  *                          HAMMER2 SYNC THREADS                            *
89  ****************************************************************************/
90 /*
91  * Primary management thread for an element of a node.  A thread will exist
92  * for each element requiring management.
93  *
94  * No management threads are needed for the SPMP or for any PMP with only
95  * a single MASTER.
96  *
97  * On the SPMP - handles bulkfree and dedup operations
98  * On a PFS    - handles remastering and synchronization
99  */
100 void
101 hammer2_primary_sync_thread(void *arg)
102 {
103         hammer2_thread_t *thr = arg;
104         hammer2_pfs_t *pmp;
105         hammer2_deferred_list_t list;
106         hammer2_deferred_ip_t *defer;
107         int error;
108         uint32_t flags;
109         uint32_t nflags;
110
111         pmp = thr->pmp;
112         bzero(&list, sizeof(list));
113
114         for (;;) {
115                 flags = thr->flags;
116                 cpu_ccfence();
117
118                 /*
119                  * Handle stop request
120                  */
121                 if (flags & HAMMER2_THREAD_STOP)
122                         break;
123
124                 /*
125                  * Handle freeze request
126                  */
127                 if (flags & HAMMER2_THREAD_FREEZE) {
128                         nflags = (flags & ~(HAMMER2_THREAD_FREEZE |
129                                             HAMMER2_THREAD_WAITING)) |
130                                  HAMMER2_THREAD_FROZEN;
131                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
132                                 continue;
133                         if (flags & HAMMER2_THREAD_WAITING)
134                                 wakeup(&thr->flags);
135                         continue;
136                 }
137
138                 if (flags & HAMMER2_THREAD_UNFREEZE) {
139                         nflags = flags & ~(HAMMER2_THREAD_UNFREEZE |
140                                            HAMMER2_THREAD_FROZEN |
141                                            HAMMER2_THREAD_WAITING);
142                         if (!atomic_cmpset_int(&thr->flags, flags, nflags))
143                                 continue;
144                         if (flags & HAMMER2_THREAD_WAITING)
145                                 wakeup(&thr->flags);
146                         continue;
147                 }
148
149                 /*
150                  * Force idle if frozen until unfrozen or stopped.
151                  */
152                 if (flags & HAMMER2_THREAD_FROZEN) {
153                         nflags = flags | HAMMER2_THREAD_WAITING;
154
155                         tsleep_interlock(&thr->flags, 0);
156                         if (atomic_cmpset_int(&thr->flags, flags, nflags))
157                                 tsleep(&thr->flags, PINTERLOCKED, "frozen", 0);
158                         continue;
159                 }
160
161                 /*
162                  * Reset state on REMASTER request
163                  */
164                 if (thr->flags & HAMMER2_THREAD_REMASTER) {
165                         nflags = flags & ~HAMMER2_THREAD_REMASTER;
166                         if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
167                                 /* reset state here */
168                         }
169                         continue;
170                 }
171
172                 /*
173                  * Synchronization scan.
174                  */
175                 if (hammer2_debug & 0x8000)
176                         kprintf("sync_slaves pfs %s clindex %d\n",
177                                 pmp->pfs_names[thr->clindex], thr->clindex);
178                 hammer2_trans_init(pmp, 0);
179
180                 hammer2_inode_ref(pmp->iroot);
181
182                 for (;;) {
183                         int didbreak = 0;
184                         /* XXX lock synchronize pmp->modify_tid */
185                         error = hammer2_sync_slaves(thr, pmp->iroot, &list, 1);
186                         if (hammer2_debug & 0x8000) {
187                                 kprintf("sync_slaves error %d defer %p\n",
188                                         error, list.base);
189                         }
190                         if (error != HAMMER2_ERROR_EAGAIN)
191                                 break;
192                         while ((defer = list.base) != NULL) {
193                                 hammer2_inode_t *nip;
194
195                                 nip = defer->ip;
196                                 error = hammer2_sync_slaves(thr, nip, &list,
197                                                         (nip == pmp->iroot));
198                                 if (error &&
199                                     error != HAMMER2_ERROR_EAGAIN &&
200                                     error != HAMMER2_ERROR_ENOENT) {
201                                         break;
202                                 }
203                                 if (hammer2_thr_break(thr)) {
204                                         didbreak = 1;
205                                         break;
206                                 }
207
208                                 /*
209                                  * If no additional defers occurred we can
210                                  * remove this one, otherwise keep it on
211                                  * the list and retry once the additional
212                                  * defers have completed.
213                                  */
214                                 if (defer == list.base) {
215                                         --list.count;
216                                         list.base = defer->next;
217                                         kfree(defer, M_HAMMER2);
218                                         defer = NULL;   /* safety */
219                                         hammer2_inode_drop(nip);
220                                 }
221                         }
222
223                         /*
224                          * If the thread is being remastered, frozen, or
225                          * stopped, clean up any left-over deferals.
226                          */
227                         if (didbreak ||
228                             (error && error != HAMMER2_ERROR_EAGAIN)) {
229                                 kprintf("didbreak\n");
230                                 while ((defer = list.base) != NULL) {
231                                         --list.count;
232                                         hammer2_inode_drop(defer->ip);
233                                         list.base = defer->next;
234                                         kfree(defer, M_HAMMER2);
235                                 }
236                                 if (error == 0 || error == HAMMER2_ERROR_EAGAIN)
237                                         error = HAMMER2_ERROR_EINPROGRESS;
238                                 break;
239                         }
240                 }
241
242                 hammer2_inode_drop(pmp->iroot);
243                 hammer2_trans_done(pmp, 0);
244
245                 if (error && error != HAMMER2_ERROR_EINPROGRESS)
246                         kprintf("hammer2_sync_slaves: error %d\n", error);
247
248                 /*
249                  * Wait for event, or 5-second poll.
250                  */
251                 nflags = flags | HAMMER2_THREAD_WAITING;
252                 tsleep_interlock(&thr->flags, 0);
253                 if (atomic_cmpset_int(&thr->flags, flags, nflags)) {
254                         tsleep(&thr->flags, 0, "h2idle", hz * 5);
255                 }
256         }
257         thr->td = NULL;
258         hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
259         /* thr structure can go invalid after this point */
260 }
261
#if 0
/*
 * Record a new cluster status for the PFS owning 'thr' and report it on
 * the console.  Only the bits in HAMMER2_CLUSTER_ZFLAGS are considered;
 * nothing is stored or printed when they match the status already saved
 * in pmp->cluster_flags.
 */
static void
hammer2_update_pfs_status(hammer2_thread_t *thr, uint32_t flags)
{
	hammer2_pfs_t *pmp;
	uint32_t zflags;

	pmp = thr->pmp;
	zflags = flags & HAMMER2_CLUSTER_ZFLAGS;

	/* No change, stay quiet */
	if (zflags == pmp->cluster_flags)
		return;
	pmp->cluster_flags = zflags;

	kprintf("pfs %p", pmp);

	/* Overall synchronization state */
	if (zflags & HAMMER2_CLUSTER_MSYNCED) {
		kprintf(" masters-all-good");
	}
	if (zflags & HAMMER2_CLUSTER_SSYNCED) {
		kprintf(" slaves-all-good");
	}

	/* Hard (quorum) access, read-write takes precedence */
	if (zflags & HAMMER2_CLUSTER_WRHARD) {
		kprintf(" quorum/rw");
	} else if (zflags & HAMMER2_CLUSTER_RDHARD) {
		kprintf(" quorum/ro");
	}

	/* Master visibility problems */
	if (zflags & HAMMER2_CLUSTER_UNHARD) {
		kprintf(" out-of-sync-masters");
	} else if (zflags & HAMMER2_CLUSTER_NOHARD) {
		kprintf(" no-masters-visible");
	}

	/* Soft access, read-write takes precedence */
	if (zflags & HAMMER2_CLUSTER_WRSOFT) {
		kprintf(" soft/rw");
	} else if (zflags & HAMMER2_CLUSTER_RDSOFT) {
		kprintf(" soft/ro");
	}

	/* Slave visibility problems */
	if (zflags & HAMMER2_CLUSTER_UNSOFT) {
		kprintf(" out-of-sync-slaves");
	} else if (zflags & HAMMER2_CLUSTER_NOSOFT) {
		kprintf(" no-slaves-visible");
	}
	kprintf("\n");
}
#endif
306
#if 0
/*
 * Debug helper: print 'label' followed by one line per cluster index
 * showing the chain key (or NULL) of cparent and then of cluster, with
 * "(I)" appended when the element has HAMMER2_CITEM_INVALID set.
 * Does nothing unless bit 0 of hammer2_debug is set.  Both clusters
 * must have the same number of chains.
 */
static void
dumpcluster(const char *label,
	    hammer2_cluster_t *cparent, hammer2_cluster_t *cluster)
{
	hammer2_cluster_t *pair[2];
	hammer2_chain_t *chain;
	int i;
	int j;

	if (!(hammer2_debug & 1))
		return;

	kprintf("%s\t", label);
	KKASSERT(cparent->nchains == cluster->nchains);

	pair[0] = cparent;
	pair[1] = cluster;

	for (i = 0; i < cparent->nchains; ++i) {
		/* rows after the first are indented to line up with the label */
		if (i != 0)
			kprintf("\t");
		kprintf("%d ", i);

		/* parent column first, then the cluster column */
		for (j = 0; j < 2; ++j) {
			chain = pair[j]->array[i].chain;
			if (chain == NULL) {
				kprintf("      NULL      %s ", "   ");
				continue;
			}
			kprintf("%016jx%s ",
				chain->bref.key,
				((pair[j]->array[i].flags &
				  HAMMER2_CITEM_INVALID) ? "(I)" : "   ")
			);
		}
		kprintf("\n");
	}
}
#endif
347
348 /*
349  * Each out of sync node sync-thread must issue an all-nodes XOP scan of
350  * the inode.  This creates a multiplication effect since the XOP scan itself
351  * issues to all nodes.  However, this is the only way we can safely
352  * synchronize nodes which might have disparate I/O bandwidths and the only
353  * way we can safely deal with stalled nodes.
354  *
355  * XXX serror / merror rollup and handling.
356  */
357 static
358 int
359 hammer2_sync_slaves(hammer2_thread_t *thr, hammer2_inode_t *ip,
360                     hammer2_deferred_list_t *list, int isroot)
361 {
362         hammer2_xop_scanall_t *xop;
363         hammer2_chain_t *parent;
364         hammer2_chain_t *chain;
365         hammer2_pfs_t *pmp;
366         hammer2_key_t key_next;
367         hammer2_tid_t sync_tid;
368         int needrescan;
369         int want_update;
370         int serror;             /* slave error */
371         int merror;             /* master error (from xop_collect) */
372         int nerror;             /* temporary error */
373         int idx;
374         int n;
375
376         pmp = ip->pmp;
377         idx = thr->clindex;     /* cluster node we are responsible for */
378         needrescan = 0;
379         want_update = 0;
380         sync_tid = 0;
381         chain = NULL;
382         parent = NULL;
383
384 #if 0
385         /*
386          * Nothing to do if all slaves are synchronized.
387          * Nothing to do if cluster not authoritatively readable.
388          */
389         if (pmp->cluster_flags & HAMMER2_CLUSTER_SSYNCED)
390                 return(0);
391         if ((pmp->cluster_flags & HAMMER2_CLUSTER_RDHARD) == 0)
392                 return(HAMMER2_ERROR_INCOMPLETE);
393 #endif
394
395         merror = 0;
396
397         /*
398          * Resolve the root inode of the PFS and determine if synchronization
399          * is needed by checking modify_tid.
400          *
401          * Retain the synchronization TID from the focus inode and use it
402          * later to synchronize the focus inode if/when the recursion
403          * succeeds.
404          */
405         {
406                 hammer2_xop_ipcluster_t *xop2;
407                 hammer2_chain_t *focus;
408
409                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
410                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
411                 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
412                                          idx);
413                 hammer2_inode_unlock(ip);
414                 merror = hammer2_xop_collect(&xop2->head, 0);
415                 if (merror == 0 && (focus = xop2->head.cluster.focus) != NULL) {
416                         sync_tid = focus->bref.modify_tid;
417                         chain = hammer2_inode_chain_and_parent(ip, idx,
418                                                     &parent,
419                                                     HAMMER2_RESOLVE_ALWAYS |
420                                                     HAMMER2_RESOLVE_SHARED);
421                         want_update = (chain->bref.modify_tid != sync_tid);
422                         if (chain) {
423                                 hammer2_chain_unlock(chain);
424                                 hammer2_chain_drop(chain);
425                                 chain = NULL;
426                         }
427                         if (parent) {
428                                 hammer2_chain_unlock(parent);
429                                 hammer2_chain_drop(parent);
430                                 parent = NULL;
431                         }
432                 }
433                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
434         }
435
436         if (want_update == 0)
437                 return(0);
438
439         /*
440          * The inode is left unlocked during the scan.  Issue a XOP
441          * that does *not* include our cluster index to iterate
442          * properly synchronized elements and resolve our cluster index
443          * against it.
444          */
445         hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
446         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
447         xop->key_beg = HAMMER2_KEY_MIN;
448         xop->key_end = HAMMER2_KEY_MAX;
449         xop->resolve_flags = HAMMER2_RESOLVE_SHARED |
450                              HAMMER2_RESOLVE_ALWAYS;
451         xop->lookup_flags = HAMMER2_LOOKUP_SHARED |
452                             HAMMER2_LOOKUP_NODIRECT |
453                             HAMMER2_LOOKUP_ALWAYS;
454         hammer2_xop_start_except(&xop->head, &hammer2_scanall_desc, idx);
455         parent = hammer2_inode_chain(ip, idx,
456                                      HAMMER2_RESOLVE_ALWAYS |
457                                      HAMMER2_RESOLVE_SHARED);
458         hammer2_inode_unlock(ip);
459
460         chain = hammer2_chain_lookup(&parent, &key_next,
461                                      HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
462                                      &serror,
463                                      HAMMER2_LOOKUP_SHARED |
464                                      HAMMER2_LOOKUP_NODIRECT |
465                                      HAMMER2_LOOKUP_NODATA);
466         merror = hammer2_xop_collect(&xop->head, 0);
467         if (hammer2_debug & 0x8000) {
468                 kprintf("START_SCAN IP=%016jx chain=%p (%016jx)\n",
469                         ip->meta.name_key, chain,
470                         (chain ? chain->bref.key : -1));
471         }
472
473         for (;;) {
474                 /*
475                  * We are done if our scan is done and the XOP scan is done.
476                  * We are done if the XOP scan failed (that is, we don't
477                  * have authoritative data to synchronize with).
478                  */
479                 int advance_local = 0;
480                 int advance_xop = 0;
481                 int dodefer = 0;
482                 hammer2_chain_t *focus;
483
484                 if (chain == NULL && merror == HAMMER2_ERROR_ENOENT)
485                         break;
486                 if (merror && merror != HAMMER2_ERROR_ENOENT)
487                         break;
488
489                 /*
490                  * Compare
491                  */
492                 if (chain && merror == HAMMER2_ERROR_ENOENT) {
493                         /*
494                          * If we have local chains but the XOP scan is done,
495                          * the chains need to be deleted.
496                          */
497                         n = -1;
498                         focus = NULL;
499                 } else if (chain == NULL) {
500                         /*
501                          * If our local scan is done but the XOP scan is not,
502                          * we need to create the missing chain(s).
503                          */
504                         n = 1;
505                         focus = xop->head.cluster.focus;
506                 } else {
507                         /*
508                          * Otherwise compare to determine the action
509                          * needed.
510                          */
511                         focus = xop->head.cluster.focus;
512                         n = hammer2_chain_cmp(chain, focus);
513                 }
514
515                 /*
516                  * Take action based on comparison results.
517                  */
518                 if (n < 0) {
519                         /*
520                          * Delete extranious local data.  This will
521                          * automatically advance the chain.
522                          */
523                         nerror = hammer2_sync_destroy(thr, &parent, &chain,
524                                                       0, idx);
525                 } else if (n == 0 && chain->bref.modify_tid !=
526                                      focus->bref.modify_tid) {
527                         /*
528                          * Matching key but local data or meta-data requires
529                          * updating.  If we will recurse, we still need to
530                          * update to compatible content first but we do not
531                          * synchronize modify_tid until the entire recursion
532                          * has completed successfully.
533                          */
534                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
535                                 nerror = hammer2_sync_replace(
536                                                 thr, parent, chain,
537                                                 0,
538                                                 idx, &xop->head, focus, 0);
539                                 dodefer = 1;
540                         } else {
541                                 nerror = hammer2_sync_replace(
542                                                 thr, parent, chain,
543                                                 focus->bref.modify_tid,
544                                                 idx, &xop->head, focus, 0);
545                         }
546                         advance_local = 1;
547                         advance_xop = 1;
548                 } else if (n == 0) {
549                         /*
550                          * 100% match, advance both
551                          */
552                         advance_local = 1;
553                         advance_xop = 1;
554                         nerror = 0;
555                 } else if (n > 0) {
556                         /*
557                          * Insert missing local data.
558                          *
559                          * If we will recurse, we still need to update to
560                          * compatible content first but we do not synchronize
561                          * modify_tid until the entire recursion has
562                          * completed successfully.
563                          */
564                         if (focus->bref.type == HAMMER2_BREF_TYPE_INODE) {
565                                 nerror = hammer2_sync_insert(
566                                                 thr, &parent, &chain,
567                                                 0,
568                                                 idx, &xop->head, focus);
569                                 dodefer = 2;
570                         } else {
571                                 nerror = hammer2_sync_insert(
572                                                 thr, &parent, &chain,
573                                                 focus->bref.modify_tid,
574                                                 idx, &xop->head, focus);
575                         }
576                         advance_local = 1;
577                         advance_xop = 1;
578                 }
579
580                 /*
581                  * We cannot recurse depth-first because the XOP is still
582                  * running in node threads for this scan.  Create a placemarker
583                  * by obtaining and record the hammer2_inode.
584                  *
585                  * We excluded our node from the XOP so we must temporarily
586                  * add it to xop->head.cluster so it is properly incorporated
587                  * into the inode.
588                  *
589                  * The deferral is pushed onto a LIFO list for bottom-up
590                  * synchronization.
591                  */
592                 if (merror == 0 && dodefer) {
593                         hammer2_inode_t *nip;
594                         hammer2_deferred_ip_t *defer;
595
596                         KKASSERT(focus->bref.type == HAMMER2_BREF_TYPE_INODE);
597
598                         defer = kmalloc(sizeof(*defer), M_HAMMER2,
599                                         M_WAITOK | M_ZERO);
600                         KKASSERT(xop->head.cluster.array[idx].chain == NULL);
601                         xop->head.cluster.array[idx].flags =
602                                                         HAMMER2_CITEM_INVALID;
603                         xop->head.cluster.array[idx].chain = chain;
604                         nip = hammer2_inode_get(pmp, &xop->head, -1, idx);
605                         xop->head.cluster.array[idx].chain = NULL;
606
607                         hammer2_inode_ref(nip);
608                         hammer2_inode_unlock(nip);
609
610                         defer->next = list->base;
611                         defer->ip = nip;
612                         list->base = defer;
613                         ++list->count;
614                         needrescan = 1;
615                 }
616
617                 /*
618                  * If at least one deferral was added and the deferral
619                  * list has grown too large, stop adding more.  This
620                  * will trigger an HAMMER2_ERROR_EAGAIN return.
621                  */
622                 if (needrescan && list->count > 1000)
623                         break;
624
625                 /*
626                  * Advancements for iteration.
627                  */
628                 if (advance_xop) {
629                         merror = hammer2_xop_collect(&xop->head, 0);
630                 }
631                 if (advance_local) {
632                         chain = hammer2_chain_next(&parent, chain, &key_next,
633                                                    key_next, HAMMER2_KEY_MAX,
634                                                    &serror,
635                                                    HAMMER2_LOOKUP_SHARED |
636                                                    HAMMER2_LOOKUP_NODIRECT |
637                                                    HAMMER2_LOOKUP_NODATA);
638                 }
639         }
640         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
641         if (chain) {
642                 hammer2_chain_unlock(chain);
643                 hammer2_chain_drop(chain);
644         }
645         if (parent) {
646                 hammer2_chain_unlock(parent);
647                 hammer2_chain_drop(parent);
648         }
649
650         /*
651          * If we added deferrals we want the caller to synchronize them
652          * and then call us again.
653          *
654          * NOTE: In this situation we do not yet want to synchronize our
655          *       inode, setting the error code also has that effect.
656          */
657         if ((merror == 0 || merror == HAMMER2_ERROR_ENOENT) && needrescan)
658                 merror = HAMMER2_ERROR_EAGAIN;
659
660         /*
661          * If no error occurred we can synchronize the inode meta-data
662          * and modify_tid.  Only limited changes are made to PFSROOTs.
663          *
664          * XXX inode lock was lost
665          */
666         if (merror == 0 || merror == HAMMER2_ERROR_ENOENT) {
667                 hammer2_xop_ipcluster_t *xop2;
668                 hammer2_chain_t *focus;
669
670                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
671                 xop2 = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
672                 hammer2_xop_start_except(&xop2->head, &hammer2_ipcluster_desc,
673                                          idx);
674                 hammer2_inode_unlock(ip);
675                 merror = hammer2_xop_collect(&xop2->head, 0);
676                 if (merror == 0) {
677                         focus = xop2->head.cluster.focus;
678                         if ((hammer2_debug & 0x8000) && focus) {
679                                 const char *filename;
680
681                                 filename = hammer2_xop_gdata(&xop2->head)->
682                                                 ipdata.filename;
683                                 kprintf("syncthr: update inode %p (%s)\n",
684                                         focus, filename);
685                                 hammer2_xop_pdata(&xop2->head);
686                         }
687                         chain = hammer2_inode_chain_and_parent(ip, idx,
688                                                     &parent,
689                                                     HAMMER2_RESOLVE_ALWAYS |
690                                                     HAMMER2_RESOLVE_SHARED);
691
692                         KKASSERT(parent != NULL);
693                         nerror = hammer2_sync_replace(
694                                         thr, parent, chain,
695                                         sync_tid,
696                                         idx, &xop2->head, focus, isroot);
697                         hammer2_chain_unlock(chain);
698                         hammer2_chain_drop(chain);
699                         hammer2_chain_unlock(parent);
700                         hammer2_chain_drop(parent);
701                         /* XXX */
702                 }
703                 hammer2_xop_retire(&xop2->head, HAMMER2_XOPMASK_VOP);
704         }
705
706         return merror;
707 }
708
709 /*
710  * Create a missing chain by copying the focus from another device.
711  *
712  * On entry *parentp and focus are both locked shared.  The chain will be
713  * created and returned in *chainp also locked shared.
714  */
715 static
716 int
717 hammer2_sync_insert(hammer2_thread_t *thr,
718                     hammer2_chain_t **parentp, hammer2_chain_t **chainp,
719                     hammer2_tid_t mtid, int idx, hammer2_xop_head_t *xop,
720                     hammer2_chain_t *focus)
721 {
722         hammer2_chain_t *chain;
723         hammer2_key_t dummy;
724         int error;
725
726 #if HAMMER2_SYNCHRO_DEBUG
727         if (hammer2_debug & 1)
728         kprintf("insert rec par=%p/%d.%016jx slave %d %d.%016jx mod=%016jx\n",
729                 *parentp, 
730                 (*parentp)->bref.type,
731                 (*parentp)->bref.key,
732                 idx,
733                 focus->bref.type, focus->bref.key, mtid);
734 #endif
735
736         /*
737          * Parent requires an exclusive lock for the insertion.
738          * We must unlock the child to avoid deadlocks while
739          * relocking the parent.
740          */
741         if (*chainp) {
742                 hammer2_chain_unlock(*chainp);
743                 hammer2_chain_drop(*chainp);
744                 *chainp = NULL;
745         }
746         hammer2_chain_unlock(*parentp);
747         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
748
749         /*
750          * We must reissue the lookup to properly position (*parentp)
751          * for the insertion.
752          */
753         chain = hammer2_chain_lookup(parentp, &dummy,
754                                      focus->bref.key, focus->bref.key,
755                                      &error,
756                                      HAMMER2_LOOKUP_NODIRECT |
757                                      HAMMER2_LOOKUP_ALWAYS);
758         KKASSERT(chain == NULL);
759
760         chain = NULL;
761         error = hammer2_chain_create(parentp, &chain,
762                                      thr->pmp, focus->bref.methods,
763                                      focus->bref.key, focus->bref.keybits,
764                                      focus->bref.type, focus->bytes,
765                                      mtid, 0, 0);
766         if (error == 0) {
767                 const hammer2_media_data_t *data;
768
769                 error = hammer2_chain_modify(chain, mtid, 0, 0);
770                 if (error)
771                         goto failed;
772
773                 /*
774                  * Copy focus to new chain
775                  */
776
777                 /* type already set */
778                 chain->bref.methods = focus->bref.methods;
779                 /* keybits already set */
780                 chain->bref.vradix = focus->bref.vradix;
781                 /* mirror_tid set by flush */
782                 KKASSERT(chain->bref.modify_tid == mtid);
783                 chain->bref.flags = focus->bref.flags;
784                 /* key already present */
785                 /* check code will be recalculated */
786
787                 /*
788                  * Copy data body.
789                  */
790                 switch(chain->bref.type) {
791                 case HAMMER2_BREF_TYPE_INODE:
792                         data = hammer2_xop_gdata(xop);
793
794                         if ((data->ipdata.meta.op_flags &
795                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
796                                 /* do not copy block table */
797                                 bcopy(data, chain->data,
798                                       offsetof(hammer2_inode_data_t, u));
799                                 hammer2_xop_pdata(xop);
800                                 break;
801                         }
802                         hammer2_xop_pdata(xop);
803                         /* fall through copy whole thing */
804                 case HAMMER2_BREF_TYPE_DATA:
805                         data = hammer2_xop_gdata(xop);
806                         bcopy(data, chain->data, chain->bytes);
807                         hammer2_chain_setcheck(chain, chain->data);
808                         hammer2_xop_pdata(xop);
809                         break;
810                 case HAMMER2_BREF_TYPE_DIRENT:
811                         /*
812                          * Directory entries embed data in the blockref.
813                          */
814                         if (chain->bytes) {
815                                 data = hammer2_xop_gdata(xop);
816                                 bcopy(data, chain->data, chain->bytes);
817                                 hammer2_chain_setcheck(chain, chain->data);
818                                 hammer2_xop_pdata(xop);
819                         } else {
820                                 chain->bref.check = focus->bref.check;
821                         }
822                         chain->bref.embed = focus->bref.embed;
823                         break;
824                 default:
825                         KKASSERT(0);
826                         break;
827                 }
828         }
829
830 failed:
831         if (chain)
832                 hammer2_chain_unlock(chain);    /* unlock, leave ref */
833         *chainp = chain;                        /* will be returned locked */
834
835         /*
836          * Avoid an ordering deadlock when relocking shared.
837          */
838         hammer2_chain_unlock(*parentp);
839         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
840                                      HAMMER2_RESOLVE_ALWAYS);
841         if (chain) {
842                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
843                                           HAMMER2_RESOLVE_ALWAYS);
844                 error = chain->error;
845         }
846
847         return error;
848 }
849
850 /*
851  * Destroy an extranious chain.
852  *
853  * Both *parentp and *chainp are locked shared.
854  *
855  * On return, *chainp will be adjusted to point to the next element in the
856  * iteration and locked shared.
857  */
858 static
859 int
860 hammer2_sync_destroy(hammer2_thread_t *thr,
861                      hammer2_chain_t **parentp, hammer2_chain_t **chainp,
862                      hammer2_tid_t mtid, int idx)
863 {
864         hammer2_chain_t *chain;
865         hammer2_key_t key_next;
866         hammer2_key_t save_key;
867         int error;
868
869         chain = *chainp;
870
871 #if HAMMER2_SYNCHRO_DEBUG
872         if (hammer2_debug & 1)
873         kprintf("destroy rec %p/%p slave %d %d.%016jx\n",
874                 *parentp, chain,
875                 idx, chain->bref.type, chain->bref.key);
876 #endif
877
878         save_key = chain->bref.key;
879         if (save_key != HAMMER2_KEY_MAX)
880                 ++save_key;
881
882         /*
883          * Try to avoid unnecessary I/O.
884          *
885          * XXX accounting not propagated up properly.  We might have to do
886          *     a RESOLVE_MAYBE here and pass 0 for the flags.
887          */
888         hammer2_chain_unlock(chain);    /* relock exclusive */
889         hammer2_chain_unlock(*parentp);
890         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_ALWAYS);
891         hammer2_chain_lock(chain, HAMMER2_RESOLVE_NEVER);
892
893         hammer2_chain_delete(*parentp, chain, mtid, HAMMER2_DELETE_PERMANENT);
894         hammer2_chain_unlock(chain);
895         hammer2_chain_drop(chain);
896         chain = NULL;                   /* safety */
897
898         hammer2_chain_unlock(*parentp); /* relock shared */
899         hammer2_chain_lock(*parentp, HAMMER2_RESOLVE_SHARED |
900                                      HAMMER2_RESOLVE_ALWAYS);
901         *chainp = hammer2_chain_lookup(parentp, &key_next,
902                                      save_key, HAMMER2_KEY_MAX,
903                                      &error,
904                                      HAMMER2_LOOKUP_SHARED |
905                                      HAMMER2_LOOKUP_NODIRECT |
906                                      HAMMER2_LOOKUP_NODATA);
907         return error;
908 }
909
910 /*
911  * cparent is locked exclusively, with an extra ref, cluster is not locked.
912  * Replace element [i] in the cluster.
913  */
914 static
915 int
916 hammer2_sync_replace(hammer2_thread_t *thr,
917                      hammer2_chain_t *parent, hammer2_chain_t *chain,
918                      hammer2_tid_t mtid, int idx,
919                      hammer2_xop_head_t *xop, hammer2_chain_t *focus,
920                      int isroot)
921 {
922         uint8_t otype;
923         int nradix;
924         int error;
925
926 #if HAMMER2_SYNCHRO_DEBUG
927         if (hammer2_debug & 1)
928         kprintf("replace rec %p slave %d %d.%016jx mod=%016jx\n",
929                 chain,
930                 idx,
931                 focus->bref.type, focus->bref.key, mtid);
932 #endif
933         hammer2_chain_unlock(chain);
934         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
935         error = chain->error;
936         if (error == 0) {
937                 const hammer2_media_data_t *data;
938
939                 if (chain->bytes != focus->bytes) {
940                         /* XXX what if compressed? */
941                         nradix = hammer2_getradix(chain->bytes);
942                         error = hammer2_chain_resize(chain, mtid, 0, nradix, 0);
943                         if (error)
944                                 goto failed;
945                 }
946                 error = hammer2_chain_modify(chain, mtid, 0, 0);
947                 if (error)
948                         goto failed;
949                 otype = chain->bref.type;
950                 data = hammer2_xop_gdata(xop);
951                 chain->bref.type = focus->bref.type;
952                 chain->bref.methods = focus->bref.methods;
953                 chain->bref.keybits = focus->bref.keybits;
954                 chain->bref.vradix = focus->bref.vradix;
955                 /* mirror_tid updated by flush */
956                 KKASSERT(mtid == 0 || chain->bref.modify_tid == mtid);
957                 chain->bref.flags = focus->bref.flags;
958                 /* key already present */
959                 /* check code will be recalculated */
960
961                 /*
962                  * Copy data body.
963                  */
964                 switch(chain->bref.type) {
965                 case HAMMER2_BREF_TYPE_INODE:
966                         /*
967                          * Special case PFSROOTs, only limited changes can
968                          * be made since the meta-data contains miscellanious
969                          * distinguishing fields.
970                          */
971                         if (isroot) {
972                                 chain->data->ipdata.meta.uflags =
973                                         data->ipdata.meta.uflags;
974                                 chain->data->ipdata.meta.rmajor =
975                                         data->ipdata.meta.rmajor;
976                                 chain->data->ipdata.meta.rminor =
977                                         data->ipdata.meta.rminor;
978                                 chain->data->ipdata.meta.ctime =
979                                         data->ipdata.meta.ctime;
980                                 chain->data->ipdata.meta.mtime =
981                                         data->ipdata.meta.mtime;
982                                 chain->data->ipdata.meta.atime =
983                                         data->ipdata.meta.atime;
984                                 /* not btime */
985                                 chain->data->ipdata.meta.uid =
986                                         data->ipdata.meta.uid;
987                                 chain->data->ipdata.meta.gid =
988                                         data->ipdata.meta.gid;
989                                 chain->data->ipdata.meta.mode =
990                                         data->ipdata.meta.mode;
991                                 chain->data->ipdata.meta.ncopies =
992                                         data->ipdata.meta.ncopies;
993                                 chain->data->ipdata.meta.comp_algo =
994                                         data->ipdata.meta.comp_algo;
995                                 chain->data->ipdata.meta.check_algo =
996                                         data->ipdata.meta.check_algo;
997                                 chain->data->ipdata.meta.data_quota =
998                                         data->ipdata.meta.data_quota;
999                                 chain->data->ipdata.meta.inode_quota =
1000                                         data->ipdata.meta.inode_quota;
1001
1002                                 /*
1003                                  * last snapshot tid controls overwrite
1004                                  */
1005                                 if (chain->data->ipdata.meta.pfs_lsnap_tid <
1006                                     data->ipdata.meta.pfs_lsnap_tid) {
1007                                         chain->data->ipdata.meta.pfs_lsnap_tid =
1008                                         data->ipdata.meta.pfs_lsnap_tid;
1009                                 }
1010
1011                                 hammer2_chain_setcheck(chain, chain->data);
1012                                 break;
1013                         }
1014
1015                         /*
1016                          * Normal replacement.
1017                          */
1018                         if ((data->ipdata.meta.op_flags &
1019                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1020                                 /*
1021                                  * If DIRECTDATA is transitioning to 0 or the
1022                                  * old chain is not an inode we have to
1023                                  * initialize the block table.
1024                                  */
1025                                 if (otype != HAMMER2_BREF_TYPE_INODE ||
1026                                     (chain->data->ipdata.meta.op_flags &
1027                                      HAMMER2_OPFLAG_DIRECTDATA)) {
1028                                         kprintf("chain inode trans "
1029                                                 "away from dd\n");
1030                                         bzero(&chain->data->ipdata.u,
1031                                               sizeof(chain->data->ipdata.u));
1032                                 }
1033                                 bcopy(data, chain->data,
1034                                       offsetof(hammer2_inode_data_t, u));
1035                                 /* XXX setcheck on inode should not be needed */
1036                                 hammer2_chain_setcheck(chain, chain->data);
1037                                 break;
1038                         }
1039                         /* fall through */
1040                 case HAMMER2_BREF_TYPE_DATA:
1041                         bcopy(data, chain->data, chain->bytes);
1042                         hammer2_chain_setcheck(chain, chain->data);
1043                         break;
1044                 case HAMMER2_BREF_TYPE_DIRENT:
1045                         /*
1046                          * Directory entries embed data in the blockref.
1047                          */
1048                         if (chain->bytes) {
1049                                 bcopy(data, chain->data, chain->bytes);
1050                                 hammer2_chain_setcheck(chain, chain->data);
1051                         } else {
1052                                 chain->bref.check = focus->bref.check;
1053                         }
1054                         chain->bref.embed = focus->bref.embed;
1055                         break;
1056                 default:
1057                         KKASSERT(0);
1058                         break;
1059                 }
1060                 hammer2_xop_pdata(xop);
1061         }
1062
1063 failed:
1064         hammer2_chain_unlock(chain);
1065         hammer2_chain_lock(chain, HAMMER2_RESOLVE_SHARED |
1066                                   HAMMER2_RESOLVE_MAYBE);
1067
1068         return error;
1069 }