HAMMER 23/many: Recovery, B-Tree, spike, I/O work.
[dragonfly.git] / sys / vfs / hammer / hammer_cursor.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.15 2008/01/24 02:14:45 dillon Exp $
35  */
36
37 /*
38  * HAMMER B-Tree index - cursor support routines
39  */
40 #include "hammer.h"
41
42 static int hammer_load_cursor_parent(hammer_cursor_t cursor);
43 static int hammer_load_cursor_parent_local(hammer_cursor_t cursor);
44 static int hammer_load_cursor_parent_cluster(hammer_cursor_t cursor);
45
46 /*
47  * Initialize a fresh cursor using the B-Tree node cache.  If the cache
48  * is not available initialize a fresh cursor at the root of the root
49  * cluster.
50  */
51 int
52 hammer_init_cursor_hmp(hammer_cursor_t cursor, struct hammer_node **cache,
53                        hammer_mount_t hmp)
54 {
55         hammer_cluster_t cluster;
56         hammer_node_t node;
57         int error;
58
59         bzero(cursor, sizeof(*cursor));
60
61         /*
62          * Step 1 - acquire a locked node from the cache if possible
63          */
64         if (cache && *cache) {
65                 node = hammer_ref_node_safe(hmp, cache, &error);
66                 if (error == 0) {
67                         hammer_lock_sh(&node->lock);
68                         if (node->flags & HAMMER_NODE_DELETED) {
69                                 hammer_unlock(&node->lock);
70                                 hammer_rel_node(node);
71                                 node = NULL;
72                         }
73                 }
74         } else {
75                 node = NULL;
76         }
77
78         /*
79          * Step 2 - If we couldn't get a node from the cache, get
80          * the one from the root of the root cluster.
81          */
82         while (node == NULL) {
83                 cluster = hammer_get_root_cluster(hmp, &error);
84                 if (error)
85                         break;
86                 node = hammer_get_node(cluster,
87                                        cluster->ondisk->clu_btree_root,
88                                        &error);
89                 hammer_rel_cluster(cluster, 0);
90                 if (error)
91                         break;
92                 hammer_lock_sh(&node->lock);
93                 if (node->flags & HAMMER_NODE_DELETED) {
94                         hammer_unlock(&node->lock);
95                         hammer_rel_node(node);
96                         node = NULL;
97                 }
98         }
99
100         /*
101          * Step 3 - finish initializing the cursor by acquiring the parent
102          */
103         cursor->node = node;
104         if (error == 0)
105                 error = hammer_load_cursor_parent(cursor);
106         KKASSERT(error == 0);
107         return(error);
108 }
109
110 /*
111  * Initialize a fresh cursor at the root of the specified cluster and
112  * limit operations to within the cluster.
113  *
114  * NOTE: cursor->parent will be set to NULL to avoid referencing B-Tree
115  * nodes in other clusters.
116  */
117 int
118 hammer_init_cursor_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster)
119 {
120         int error;
121
122         bzero(cursor, sizeof(*cursor));
123         cursor->flags |= HAMMER_CURSOR_INCLUSTER;
124         cursor->node = hammer_get_node(cluster,
125                                        cluster->ondisk->clu_btree_root,
126                                        &error);
127         if (error == 0)
128                 hammer_lock_sh(&cursor->node->lock);
129         cursor->left_bound = &cluster->ondisk->clu_btree_beg;
130         cursor->right_bound = &cluster->ondisk->clu_btree_end;
131         KKASSERT(error == 0);
132         return(error);
133 }
134
135 /*
136  * We are finished with a cursor.  We NULL out various fields as sanity
137  * check, in case the structure is inappropriately used afterwords.
138  */
139 void
140 hammer_done_cursor(hammer_cursor_t cursor)
141 {
142         if (cursor->parent) {
143                 hammer_unlock(&cursor->parent->lock);
144                 hammer_rel_node(cursor->parent);
145                 cursor->parent = NULL;
146         }
147         if (cursor->node) {
148                 hammer_unlock(&cursor->node->lock);
149                 hammer_rel_node(cursor->node);
150                 cursor->node = NULL;
151         }
152         if (cursor->data_buffer) {
153                 hammer_rel_buffer(cursor->data_buffer, 0);
154                 cursor->data_buffer = NULL;
155         }
156         if (cursor->record_buffer) {
157                 hammer_rel_buffer(cursor->record_buffer, 0);
158                 cursor->record_buffer = NULL;
159         }
160         if (cursor->ip)
161                 hammer_mem_done(cursor);
162
163         /*
164          * If we deadlocked this node will be referenced.  Do a quick
165          * lock/unlock to wait for the deadlock condition to clear.
166          */
167         if (cursor->deadlk_node) {
168                 hammer_lock_ex(&cursor->deadlk_node->lock);
169                 hammer_unlock(&cursor->deadlk_node->lock);
170                 hammer_rel_node(cursor->deadlk_node);
171                 cursor->deadlk_node = NULL;
172         }
173
174         cursor->data = NULL;
175         cursor->record = NULL;
176         cursor->left_bound = NULL;
177         cursor->right_bound = NULL;
178 }
179
180 /*
181  * Upgrade cursor->node and cursor->parent to exclusive locks.  This
182  * function can return EDEADLK.
183  *
184  * If we fail to upgrade the lock and cursor->deadlk_node is NULL, 
185  * we add another reference to the node that failed and set
186  * cursor->deadlk_node so hammer_done_cursor() can block on it.
187  */
188 int
189 hammer_cursor_upgrade(hammer_cursor_t cursor)
190 {
191         int error;
192
193         if (hammer_lock_held(&cursor->node->lock) < 0) {
194                 error = hammer_lock_upgrade(&cursor->node->lock);
195                 if (error && cursor->deadlk_node == NULL) {
196                         cursor->deadlk_node = cursor->node;
197                         hammer_ref_node(cursor->deadlk_node);
198                 }
199         } else {
200                 error = 0;
201         }
202         if (cursor->parent && hammer_lock_held(&cursor->parent->lock) < 0) {
203                 error = hammer_lock_upgrade(&cursor->parent->lock);
204                 if (error && cursor->deadlk_node == NULL) {
205                         cursor->deadlk_node = cursor->parent;
206                         hammer_ref_node(cursor->deadlk_node);
207                 }
208         } else {
209                 error = 0;
210         }
211         return(error);
212 }
213
214 /*
215  * Downgrade cursor->node and cursor->parent to shared locks.  This
216  * function can return EDEADLK.
217  */
218 void
219 hammer_cursor_downgrade(hammer_cursor_t cursor)
220 {
221         if (hammer_lock_held(&cursor->node->lock) > 0)
222                 hammer_lock_downgrade(&cursor->node->lock);
223         if (cursor->parent && hammer_lock_held(&cursor->parent->lock) > 0)
224                 hammer_lock_downgrade(&cursor->parent->lock);
225 }
226
227
228 #if 0
229
230 /*
231  * Acquire the parent B-Tree node of the specified node, returning a
232  * referenced but unlocked node.  NULL can be returned with *errorp == 0
233  * if node is the root node of the root cluster.
234  */
235 static
236 hammer_node_t
237 hammer_get_parent_node(hammer_node_t node, int *errorp)
238 {
239         hammer_cluster_t cluster;
240         hammer_node_t parent;
241
242         cluster = node->cluster;
243         if (node->ondisk->parent) {
244                 /*
245                  * Local parent
246                  */
247                 parent = hammer_get_node(cluster, node->ondisk->parent, errorp);
248         } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0) {
249                 /*
250                  * At cluster root, locate node in parent cluster
251                  */
252                 hammer_cluster_ondisk_t ondisk;
253                 hammer_cluster_t pcluster;
254                 hammer_volume_t pvolume;
255                 int32_t clu_no;
256                 int32_t vol_no;
257
258                 ondisk = cluster->ondisk;
259                 vol_no = ondisk->clu_btree_parent_vol_no;
260                 clu_no = ondisk->clu_btree_parent_clu_no;
261
262                 /*
263                  * Acquire the node from (volume, cluster, offset)
264                  */
265                 pvolume = hammer_get_volume(cluster->volume->hmp, vol_no,
266                                             errorp);
267                 if (*errorp)
268                         return (NULL);
269                 pcluster = hammer_get_cluster(pvolume, clu_no, errorp, 0);
270                 hammer_rel_volume(pvolume, 0);
271                 if (*errorp)
272                         return (NULL);
273                 parent = hammer_get_node(pcluster,
274                                          ondisk->clu_btree_parent_offset,
275                                          errorp);
276                 hammer_rel_cluster(pcluster, 0);
277         } else {
278                 /*
279                  * At root of root cluster, there is no parent.
280                  */
281                 KKASSERT(cluster->ondisk->clu_btree_parent_vol_no == -1);
282                 parent = NULL;
283                 *errorp = 0;
284         }
285         return(parent);
286 }
287
288 #endif
289
290 /*
291  * Load the parent of cursor->node into cursor->parent.  There are several
292  * cases.  (1) The parent is in the current cluster.  (2) We are at the
293  * root of the cluster and the parent is in another cluster.  (3) We are at
294  * the root of the root cluster.
295  *
296  * If HAMMER_CURSOR_INCLUSTER is set and we are at the root of the cluster,
297  * we do not access the parent cluster at all and make the cursor look like
298  * its at the root.
299  */
300 static
301 int
302 hammer_load_cursor_parent(hammer_cursor_t cursor)
303 {
304         hammer_cluster_t cluster;
305         int error;
306
307         cluster = cursor->node->cluster;
308
309         if (cursor->node->ondisk->parent) {
310                 error = hammer_load_cursor_parent_local(cursor);
311         } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0 &&
312                    ((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0)
313         ) {
314                 error = hammer_load_cursor_parent_cluster(cursor);
315         } else {
316                 cursor->parent = NULL;
317                 cursor->parent_index = 0;
318                 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
319                 cursor->right_bound = &cluster->ondisk->clu_btree_end;
320                 error = 0;
321         }
322         return(error);
323 }
324
325 static
326 int
327 hammer_load_cursor_parent_local(hammer_cursor_t cursor)
328 {
329         hammer_node_t node;
330         hammer_node_t parent;
331         hammer_btree_elm_t elm;
332         int error;
333         int i;
334
335         node = cursor->node;
336         parent = hammer_get_node(node->cluster, node->ondisk->parent, &error);
337         if (error)
338                 return(error);
339         elm = NULL;
340         for (i = 0; i < parent->ondisk->count; ++i) {
341                 elm = &parent->ondisk->elms[i];
342                 if (parent->ondisk->elms[i].internal.subtree_offset ==
343                     node->node_offset) {
344                         break;
345                 }
346         }
347         if (i == parent->ondisk->count)
348                 panic("Bad B-Tree link: parent %p node %p\n", parent, node);
349         KKASSERT(i != parent->ondisk->count);
350         cursor->parent = parent;
351         cursor->parent_index = i;
352         cursor->left_bound = &elm[0].internal.base;
353         cursor->right_bound = &elm[1].internal.base;
354
355         hammer_lock_sh(&parent->lock);
356         return(error);
357 }
358
359 static
360 int
361 hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
362 {
363         hammer_cluster_ondisk_t ondisk;
364         hammer_cluster_t pcluster;
365         hammer_cluster_t ccluster;
366         hammer_volume_t volume;
367         hammer_node_t node;
368         hammer_node_t parent;
369         hammer_btree_elm_t elm;
370         int32_t clu_no;
371         int32_t vol_no;
372         int error;
373         int i;
374
375         node = cursor->node;
376         ccluster = node->cluster;
377         ondisk = ccluster->ondisk;
378         vol_no = ondisk->clu_btree_parent_vol_no;
379         clu_no = ondisk->clu_btree_parent_clu_no;
380
381         /*
382          * Acquire the node from (volume, cluster, offset).  This should
383          * be a leaf node containing the HAMMER_BTREE_TYPE_SPIKE_END element.
384          */
385         volume = hammer_get_volume(ccluster->volume->hmp, vol_no, &error);
386         if (error)
387                 return (error);
388         pcluster = hammer_get_cluster(volume, clu_no, &error, 0);
389         hammer_rel_volume(volume, 0);
390         if (error)
391                 return (error);
392         parent = hammer_get_node(pcluster, ondisk->clu_btree_parent_offset,
393                                  &error);
394         hammer_rel_cluster(pcluster, 0);
395         if (error)
396                 return (error);
397         KKASSERT(parent->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
398
399         /* 
400          * Ok, we have the node.  Locate the inter-cluster element
401          */
402         elm = NULL;
403         for (i = 0; i < parent->ondisk->count; ++i) {
404                 elm = &parent->ondisk->elms[i];
405
406                 if (elm->leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_END &&
407                     elm->leaf.spike_clu_no == cursor->node->cluster->clu_no) {
408                         break;
409                 }
410         }
411         KKASSERT(i != parent->ondisk->count);
412         KKASSERT(i && elm[-1].leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_BEG);
413         cursor->parent = parent;
414         cursor->parent_index = i;
415         cursor->left_bound = &ccluster->ondisk->clu_btree_beg;
416         cursor->right_bound = &ccluster->ondisk->clu_btree_end;
417
418         KKASSERT(hammer_btree_cmp(&elm[-1].leaf.base,
419                  &ccluster->ondisk->clu_btree_beg) == 0);
420             /*
421              * spike_end is an inclusive boundary and will != clu_btree_end
422         KKASSERT(hammer_btree_cmp(cursor->right_bound,
423                  &ccluster->ondisk->clu_btree_end) >= 0);
424             */
425
426         hammer_lock_sh(&parent->lock);
427         return(0);
428 }
429
430 /*
431  * Cursor up to our parent node.  Return ENOENT if we are at the root of
432  * the root cluster.
433  *
434  * If doing a nonblocking cursor-up and we are unable to acquire the lock,
435  * the cursor remains unchanged.
436  */
437 int
438 hammer_cursor_up(hammer_cursor_t cursor)
439 {
440         int error;
441
442         hammer_cursor_downgrade(cursor);
443
444         /*
445          * Set the node to its parent.  If the parent is NULL we are at
446          * the root of the root cluster and return ENOENT.
447          */
448         hammer_unlock(&cursor->node->lock);
449         hammer_rel_node(cursor->node);
450         cursor->node = cursor->parent;
451         cursor->index = cursor->parent_index;
452         cursor->parent = NULL;
453         cursor->parent_index = 0;
454
455         if (cursor->node == NULL) {
456                 error = ENOENT;
457         } else if ((cursor->flags & HAMMER_CURSOR_INCLUSTER) &&
458                    cursor->node->ondisk->parent == 0
459         ) {
460                 error = ENOENT;
461         } else {
462                 error = hammer_load_cursor_parent(cursor);
463         }
464         return(error);
465 }
466
467 /*
468  * Set the cursor to the root of the current cursor's cluster.
469  */
470 int
471 hammer_cursor_toroot(hammer_cursor_t cursor)
472 {
473         hammer_cluster_t cluster;
474         int error;
475
476         /*
477          * Already at root of cluster?
478          */
479         if (cursor->node->ondisk->parent == 0) 
480                 return (0);
481
482         hammer_cursor_downgrade(cursor);
483
484         /*
485          * Parent is root of cluster?
486          */
487         if (cursor->parent->ondisk->parent == 0)
488                 return (hammer_cursor_up(cursor));
489
490         /*
491          * Ok, reload the cursor with the root of the cluster, then
492          * locate its parent.
493          */
494         cluster = cursor->node->cluster;
495         error = hammer_ref_cluster(cluster);
496         if (error)
497                 return (error);
498
499         hammer_unlock(&cursor->parent->lock);
500         hammer_rel_node(cursor->parent);
501         hammer_unlock(&cursor->node->lock);
502         hammer_rel_node(cursor->node);
503         cursor->parent = NULL;
504         cursor->parent_index = 0;
505
506         cursor->node = hammer_get_node(cluster, cluster->ondisk->clu_btree_root,
507                                        &error);
508         cursor->index = 0;
509         hammer_lock_sh(&cursor->node->lock);
510         hammer_rel_cluster(cluster, 0);
511         if (error == 0)
512                 error = hammer_load_cursor_parent(cursor);
513         return(error);
514 }
515
516 /*
517  * Cursor down through the current node, which must be an internal node.
518  *
519  * This routine adjusts the cursor and sets index to 0.
520  */
521 int
522 hammer_cursor_down(hammer_cursor_t cursor)
523 {
524         hammer_node_t node;
525         hammer_btree_elm_t elm;
526         hammer_volume_t volume;
527         hammer_cluster_t cluster;
528         int32_t vol_no;
529         int32_t clu_no;
530         int error;
531
532         /*
533          * The current node becomes the current parent
534          */
535         hammer_cursor_downgrade(cursor);
536         node = cursor->node;
537         KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
538         if (cursor->parent) {
539                 hammer_unlock(&cursor->parent->lock);
540                 hammer_rel_node(cursor->parent);
541         }
542         cursor->parent = node;
543         cursor->parent_index = cursor->index;
544         cursor->node = NULL;
545         cursor->index = 0;
546
547         /*
548          * Extract element to push into at (node,index), set bounds.
549          */
550         elm = &node->ondisk->elms[cursor->parent_index];
551
552         /*
553          * Ok, push down into elm.  If elm specifies an internal or leaf
554          * node the current node must be an internal node.  If elm specifies
555          * a spike then the current node must be a leaf node.
556          *
557          * Cursoring down through a cluster transition when the INCLUSTER
558          * flag is set is not legal.
559          */
560         switch(elm->base.btype) {
561         case HAMMER_BTREE_TYPE_INTERNAL:
562         case HAMMER_BTREE_TYPE_LEAF:
563                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
564                 KKASSERT(elm->internal.subtree_offset != 0);
565                 cursor->left_bound = &elm[0].internal.base;
566                 cursor->right_bound = &elm[1].internal.base;
567                 node = hammer_get_node(node->cluster,
568                                        elm->internal.subtree_offset,
569                                        &error);
570                 if (error == 0) {
571                         KKASSERT(elm->base.btype == node->ondisk->type);
572                         if (node->ondisk->parent != cursor->parent->node_offset)
573                                 panic("node %p %d vs %d\n", node, node->ondisk->parent, cursor->parent->node_offset);
574                         KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
575                 }
576                 break;
577         case HAMMER_BTREE_TYPE_SPIKE_BEG:
578                 /* case not supported yet */
579                 KKASSERT(0);
580                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
581                 KKASSERT(cursor->parent_index < node->ondisk->count - 1);
582                 KKASSERT(elm[1].leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_END);
583                 ++elm;
584                 ++cursor->parent_index;
585                 /* fall through */
586         case HAMMER_BTREE_TYPE_SPIKE_END:
587                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
588                 KKASSERT((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0);
589                 vol_no = elm->leaf.spike_vol_no;
590                 clu_no = elm->leaf.spike_clu_no;
591                 volume = hammer_get_volume(node->cluster->volume->hmp,
592                                            vol_no, &error);
593                 KKASSERT(error != EINVAL);
594                 if (error)
595                         return(error);
596                 cluster = hammer_get_cluster(volume, clu_no, &error, 0);
597                 KKASSERT(error != EINVAL);
598                 hammer_rel_volume(volume, 0);
599                 if (error)
600                         return(error);
601                 KKASSERT(cluster->ondisk->clu_btree_parent_offset ==
602                          cursor->parent->node_offset);
603                 KKASSERT(cluster->ondisk->clu_btree_parent_clu_no ==
604                          cursor->parent->cluster->clu_no);
605
606                 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
607                 cursor->right_bound = &cluster->ondisk->clu_btree_end;
608                 node = hammer_get_node(cluster,
609                                        cluster->ondisk->clu_btree_root,
610                                        &error);
611                 hammer_rel_cluster(cluster, 0);
612                 break;
613         default:
614                 panic("hammer_cursor_down: illegal btype %02x (%c)\n",
615                       elm->base.btype,
616                       (elm->base.btype ? elm->base.btype : '?'));
617                 break;
618         }
619         if (error == 0) {
620                 hammer_lock_sh(&node->lock);
621                 cursor->node = node;
622                 cursor->index = 0;
623         }
624         return(error);
625 }
626