HAMMER 20B/many: New spike topology, simplify the B-Tree code.
[dragonfly.git] / sys / vfs / hammer / hammer_cursor.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.13 2008/01/17 05:06:09 dillon Exp $
35  */
36
37 /*
38  * HAMMER B-Tree index - cursor support routines
39  */
40 #include "hammer.h"
41
42 static int hammer_load_cursor_parent(hammer_cursor_t cursor);
43 static int hammer_load_cursor_parent_local(hammer_cursor_t cursor);
44 static int hammer_load_cursor_parent_cluster(hammer_cursor_t cursor);
45
46 /*
47  * Initialize a fresh cursor using the B-Tree node cache.  If the cache
48  * is not available initialize a fresh cursor at the root of the root
49  * cluster.
50  */
51 int
52 hammer_init_cursor_hmp(hammer_cursor_t cursor, struct hammer_node **cache,
53                        hammer_mount_t hmp)
54 {
55         hammer_cluster_t cluster;
56         hammer_node_t node;
57         int error;
58
59         bzero(cursor, sizeof(*cursor));
60
61         /*
62          * Step 1 - acquire a locked node from the cache if possible
63          */
64         if (cache && *cache) {
65                 node = hammer_ref_node_safe(hmp, cache, &error);
66                 if (error == 0) {
67                         hammer_lock_ex(&node->lock);
68                         if (node->flags & HAMMER_NODE_DELETED) {
69                                 hammer_unlock(&node->lock);
70                                 hammer_rel_node(node);
71                                 node = NULL;
72                         }
73                 }
74         } else {
75                 node = NULL;
76         }
77
78         /*
79          * Step 2 - If we couldn't get a node from the cache, get
80          * the one from the root of the root cluster.
81          */
82         while (node == NULL) {
83                 cluster = hammer_get_root_cluster(hmp, &error);
84                 if (error)
85                         break;
86                 node = hammer_get_node(cluster,
87                                        cluster->ondisk->clu_btree_root,
88                                        &error);
89                 hammer_rel_cluster(cluster, 0);
90                 if (error)
91                         break;
92                 hammer_lock_ex(&node->lock);
93                 if (node->flags & HAMMER_NODE_DELETED) {
94                         hammer_unlock(&node->lock);
95                         hammer_rel_node(node);
96                         node = NULL;
97                 }
98         }
99
100         /*
101          * Step 3 - finish initializing the cursor by acquiring the parent
102          */
103         cursor->node = node;
104         if (error == 0)
105                 error = hammer_load_cursor_parent(cursor);
106         KKASSERT(error == 0);
107         return(error);
108 }
109
110 /*
111  * Initialize a fresh cursor at the root of the specified cluster and
112  * limit operations to within the cluster.
113  */
114 int
115 hammer_init_cursor_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster)
116 {
117         int error;
118
119         bzero(cursor, sizeof(*cursor));
120         cursor->flags |= HAMMER_CURSOR_INCLUSTER;
121         cursor->node = hammer_get_node(cluster,
122                                        cluster->ondisk->clu_btree_root,
123                                        &error);
124         if (error == 0) {
125                 hammer_lock_ex(&cursor->node->lock);
126                 error = hammer_load_cursor_parent(cursor);
127         }
128         KKASSERT(error == 0);
129         return(error);
130 }
131
132 /*
133  * We are finished with a cursor.  We NULL out various fields as sanity
134  * check, in case the structure is inappropriately used afterwords.
135  */
136 void
137 hammer_done_cursor(hammer_cursor_t cursor)
138 {
139         if (cursor->parent) {
140                 hammer_unlock(&cursor->parent->lock);
141                 hammer_rel_node(cursor->parent);
142                 cursor->parent = NULL;
143         }
144         if (cursor->node) {
145                 hammer_unlock(&cursor->node->lock);
146                 hammer_rel_node(cursor->node);
147                 cursor->node = NULL;
148         }
149         if (cursor->data_buffer) {
150                 hammer_rel_buffer(cursor->data_buffer, 0);
151                 cursor->data_buffer = NULL;
152         }
153         if (cursor->record_buffer) {
154                 hammer_rel_buffer(cursor->record_buffer, 0);
155                 cursor->record_buffer = NULL;
156         }
157         if (cursor->ip)
158                 hammer_mem_done(cursor);
159
160         cursor->data = NULL;
161         cursor->record = NULL;
162         cursor->left_bound = NULL;
163         cursor->right_bound = NULL;
164 }
165
166 /*
167  * Acquire the parent B-Tree node of the specified node, returning a
168  * referenced but unlocked node.  NULL can be returned with *errorp == 0
169  * if node is the root node of the root cluster.
170  */
171 static
172 hammer_node_t
173 hammer_get_parent_node(hammer_node_t node, int *errorp)
174 {
175         hammer_cluster_t cluster;
176         hammer_node_t parent;
177
178         cluster = node->cluster;
179         if (node->ondisk->parent) {
180                 /*
181                  * Local parent
182                  */
183                 parent = hammer_get_node(cluster, node->ondisk->parent, errorp);
184         } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0) {
185                 /*
186                  * At cluster root, locate node in parent cluster
187                  */
188                 hammer_cluster_ondisk_t ondisk;
189                 hammer_cluster_t pcluster;
190                 hammer_volume_t pvolume;
191                 int32_t clu_no;
192                 int32_t vol_no;
193
194                 ondisk = cluster->ondisk;
195                 vol_no = ondisk->clu_btree_parent_vol_no;
196                 clu_no = ondisk->clu_btree_parent_clu_no;
197
198                 /*
199                  * Acquire the node from (volume, cluster, offset)
200                  */
201                 pvolume = hammer_get_volume(cluster->volume->hmp, vol_no,
202                                             errorp);
203                 if (*errorp)
204                         return (NULL);
205                 pcluster = hammer_get_cluster(pvolume, clu_no, errorp, 0);
206                 hammer_rel_volume(pvolume, 0);
207                 if (*errorp)
208                         return (NULL);
209                 parent = hammer_get_node(pcluster,
210                                          ondisk->clu_btree_parent_offset,
211                                          errorp);
212                 hammer_rel_cluster(pcluster, 0);
213         } else {
214                 /*
215                  * At root of root cluster, there is no parent.
216                  */
217                 KKASSERT(cluster->ondisk->clu_btree_parent_vol_no == -1);
218                 parent = NULL;
219                 *errorp = 0;
220         }
221         return(parent);
222 }
223
224 /*
225  * Load the parent of cursor->node into cursor->parent.  There are several
226  * cases.  (1) The parent is in the current cluster.  (2) We are at the
227  * root of the cluster and the parent is in another cluster.  (3) We are at
228  * the root of the root cluster.
229  *
230  * If HAMMER_CURSOR_INCLUSTER is set and we are at the root of the cluster,
231  * we do not access the parent cluster at all and make the cursor look like
232  * its at the root.
233  */
234 static
235 int
236 hammer_load_cursor_parent(hammer_cursor_t cursor)
237 {
238         hammer_cluster_t cluster;
239         int error;
240
241         cluster = cursor->node->cluster;
242
243         if (cursor->node->ondisk->parent) {
244                 error = hammer_load_cursor_parent_local(cursor);
245         } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0 &&
246                    ((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0)
247         ) {
248                 error = hammer_load_cursor_parent_cluster(cursor);
249         } else {
250                 cursor->parent = NULL;
251                 cursor->parent_index = 0;
252                 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
253                 cursor->right_bound = &cluster->ondisk->clu_btree_end;
254                 error = 0;
255         }
256         return(error);
257 }
258
259 static
260 int
261 hammer_load_cursor_parent_local(hammer_cursor_t cursor)
262 {
263         hammer_node_t node;
264         hammer_node_t parent;
265         hammer_btree_elm_t elm;
266         int error;
267         int i;
268
269         node = cursor->node;
270         parent = hammer_get_node(node->cluster, node->ondisk->parent, &error);
271         if (error)
272                 return(error);
273         elm = NULL;
274         for (i = 0; i < parent->ondisk->count; ++i) {
275                 elm = &parent->ondisk->elms[i];
276                 if (parent->ondisk->elms[i].internal.subtree_offset ==
277                     node->node_offset) {
278                         break;
279                 }
280         }
281         if (i == parent->ondisk->count)
282                 panic("Bad B-Tree link: parent %p node %p\n", parent, node);
283         KKASSERT(i != parent->ondisk->count);
284         cursor->parent = parent;
285         cursor->parent_index = i;
286         cursor->left_bound = &elm[0].internal.base;
287         cursor->right_bound = &elm[1].internal.base;
288
289         if (hammer_lock_ex_try(&parent->lock) != 0) {
290                 hammer_unlock(&cursor->node->lock);
291                 hammer_lock_ex(&parent->lock);
292                 hammer_lock_ex(&cursor->node->lock);
293                 /* XXX check node generation count */
294         }
295         return(error);
296 }
297
298 static
299 int
300 hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
301 {
302         hammer_cluster_ondisk_t ondisk;
303         hammer_cluster_t pcluster;
304         hammer_cluster_t ccluster;
305         hammer_volume_t volume;
306         hammer_node_t node;
307         hammer_node_t parent;
308         hammer_btree_elm_t elm;
309         int32_t clu_no;
310         int32_t vol_no;
311         int error;
312         int i;
313
314         node = cursor->node;
315         ccluster = node->cluster;
316         ondisk = ccluster->ondisk;
317         vol_no = ondisk->clu_btree_parent_vol_no;
318         clu_no = ondisk->clu_btree_parent_clu_no;
319
320         /*
321          * Acquire the node from (volume, cluster, offset).  This should
322          * be a leaf node containing the HAMMER_BTREE_TYPE_SPIKE_END element.
323          */
324         volume = hammer_get_volume(ccluster->volume->hmp, vol_no, &error);
325         if (error)
326                 return (error);
327         pcluster = hammer_get_cluster(volume, clu_no, &error, 0);
328         hammer_rel_volume(volume, 0);
329         if (error)
330                 return (error);
331         parent = hammer_get_node(pcluster, ondisk->clu_btree_parent_offset,
332                                  &error);
333         hammer_rel_cluster(pcluster, 0);
334         if (error)
335                 return (error);
336         KKASSERT(parent->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
337
338         /* 
339          * Ok, we have the node.  Locate the inter-cluster element
340          */
341         elm = NULL;
342         for (i = 0; i < parent->ondisk->count; ++i) {
343                 elm = &parent->ondisk->elms[i];
344
345                 if (elm->leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_END &&
346                     elm->leaf.spike_clu_no == cursor->node->cluster->clu_no) {
347                         break;
348                 }
349         }
350         KKASSERT(i != parent->ondisk->count);
351         KKASSERT(i && elm[-1].leaf.base.btype == HAMMER_BTREE_TYPE_SPIKE_BEG);
352         cursor->parent = parent;
353         cursor->parent_index = i;
354         cursor->left_bound = &ccluster->ondisk->clu_btree_beg;
355         cursor->right_bound = &ccluster->ondisk->clu_btree_end;
356
357         KKASSERT(hammer_btree_cmp(&elm[-1].leaf.base,
358                  &ccluster->ondisk->clu_btree_beg) == 0);
359             /*
360              * spike_end is an inclusive boundary and will != clu_btree_end
361         KKASSERT(hammer_btree_cmp(cursor->right_bound,
362                  &ccluster->ondisk->clu_btree_end) >= 0);
363             */
364
365         if (hammer_lock_ex_try(&parent->lock) != 0) {
366                 hammer_unlock(&cursor->node->lock);
367                 hammer_lock_ex(&parent->lock);
368                 hammer_lock_ex(&cursor->node->lock);
369                 /* XXX check node generation count */
370         }
371         return(0);
372 }
373
374 /*
375  * Cursor up to our parent node.  Return ENOENT if we are at the root of
376  * the root cluster.
377  *
378  * If doing a nonblocking cursor-up and we are unable to acquire the lock,
379  * the cursor remains unchanged.
380  */
381 int
382 hammer_cursor_up(hammer_cursor_t cursor, int nonblock)
383 {
384         hammer_node_t save;
385         int error;
386
387         /*
388          * If asked to do this non-blocking acquire a lock on the parent
389          * now, before we mess with the cursor.
390          */
391         if (nonblock) {
392                 save = hammer_get_parent_node(cursor->parent, &error);
393                 if (error)
394                         return(error);
395                 if (save) {
396                         if (hammer_lock_ex_try(&save->lock) != 0) {
397                                 hammer_rel_node(save);
398                                 return(EAGAIN);
399                         }
400                 }
401         } else {
402                 save = NULL;
403         }
404
405         /*
406          * Set the node to its parent.  If the parent is NULL we are at
407          * the root of the root cluster and return ENOENT.
408          */
409         hammer_unlock(&cursor->node->lock);
410         hammer_rel_node(cursor->node);
411         cursor->node = cursor->parent;
412         cursor->index = cursor->parent_index;
413         cursor->parent = NULL;
414         cursor->parent_index = 0;
415
416         if (cursor->node == NULL) {
417                 error = ENOENT;
418         } else if ((cursor->flags & HAMMER_CURSOR_INCLUSTER) &&
419                    cursor->node->ondisk->parent == 0
420         ) {
421                 error = ENOENT;
422         } else {
423                 error = hammer_load_cursor_parent(cursor);
424         }
425         if (save) {
426                 hammer_unlock(&save->lock);
427                 hammer_rel_node(save);
428         }
429         return(error);
430 }
431
432 /*
433  * Set the cursor to the root of the current cursor's cluster.
434  */
435 int
436 hammer_cursor_toroot(hammer_cursor_t cursor)
437 {
438         hammer_cluster_t cluster;
439         int error;
440
441         /*
442          * Already at root of cluster?
443          */
444         if (cursor->node->ondisk->parent == 0) 
445                 return (0);
446
447         /*
448          * Parent is root of cluster?
449          */
450         if (cursor->parent->ondisk->parent == 0)
451                 return (hammer_cursor_up(cursor, 0));
452
453         /*
454          * Ok, reload the cursor with the root of the cluster, then
455          * locate its parent.
456          */
457         cluster = cursor->node->cluster;
458         error = hammer_ref_cluster(cluster);
459         if (error)
460                 return (error);
461
462         hammer_unlock(&cursor->parent->lock);
463         hammer_rel_node(cursor->parent);
464         hammer_unlock(&cursor->node->lock);
465         hammer_rel_node(cursor->node);
466         cursor->parent = NULL;
467         cursor->parent_index = 0;
468
469         cursor->node = hammer_get_node(cluster, cluster->ondisk->clu_btree_root,
470                                        &error);
471         cursor->index = 0;
472         hammer_lock_ex(&cursor->node->lock);
473         hammer_rel_cluster(cluster, 0);
474         if (error == 0)
475                 error = hammer_load_cursor_parent(cursor);
476         return(error);
477 }
478
479 /*
480  * Cursor down through the current node, which must be an internal node.
481  *
482  * This routine adjusts the cursor and sets index to 0.
483  */
484 int
485 hammer_cursor_down(hammer_cursor_t cursor)
486 {
487         hammer_node_t node;
488         hammer_btree_elm_t elm;
489         hammer_volume_t volume;
490         hammer_cluster_t cluster;
491         int32_t vol_no;
492         int32_t clu_no;
493         int error;
494
495         /*
496          * The current node becomes the current parent
497          */
498         node = cursor->node;
499         KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count);
500         if (cursor->parent) {
501                 hammer_unlock(&cursor->parent->lock);
502                 hammer_rel_node(cursor->parent);
503         }
504         cursor->parent = node;
505         cursor->parent_index = cursor->index;
506         cursor->node = NULL;
507         cursor->index = 0;
508
509         /*
510          * Extract element to push into at (node,index), set bounds.
511          */
512         elm = &node->ondisk->elms[cursor->parent_index];
513
514         /*
515          * Ok, push down into elm.  If elm specifies an internal or leaf
516          * node the current node must be an internal node.  If elm specifies
517          * a spike then the current node must be a leaf node.
518          *
519          * Cursoring down through a cluster transition when the INCLUSTER
520          * flag is set is not legal.
521          */
522         switch(elm->base.btype) {
523         case HAMMER_BTREE_TYPE_INTERNAL:
524         case HAMMER_BTREE_TYPE_LEAF:
525                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL);
526                 KKASSERT(elm->internal.subtree_offset != 0);
527                 cursor->left_bound = &elm[0].internal.base;
528                 cursor->right_bound = &elm[1].internal.base;
529                 node = hammer_get_node(node->cluster,
530                                        elm->internal.subtree_offset,
531                                        &error);
532                 if (error == 0) {
533                         KKASSERT(elm->base.btype == node->ondisk->type);
534                         if(node->ondisk->parent != cursor->parent->node_offset)
535                                 kprintf("node %p %d vs %d\n", node, node->ondisk->parent, cursor->parent->node_offset);
536                         KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
537                 }
538                 break;
539         case HAMMER_BTREE_TYPE_SPIKE_BEG:
540         case HAMMER_BTREE_TYPE_SPIKE_END:
541                 KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
542                 KKASSERT((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0);
543                 vol_no = elm->leaf.spike_vol_no;
544                 clu_no = elm->leaf.spike_clu_no;
545                 volume = hammer_get_volume(node->cluster->volume->hmp,
546                                            vol_no, &error);
547                 KKASSERT(error != EINVAL);
548                 if (error)
549                         return(error);
550                 cluster = hammer_get_cluster(volume, clu_no, &error, 0);
551                 KKASSERT(error != EINVAL);
552                 hammer_rel_volume(volume, 0);
553                 if (error)
554                         return(error);
555                 cursor->left_bound = &cluster->ondisk->clu_btree_beg;
556                 cursor->right_bound = &cluster->ondisk->clu_btree_end;
557                 node = hammer_get_node(cluster,
558                                        cluster->ondisk->clu_btree_root,
559                                        &error);
560                 hammer_rel_cluster(cluster, 0);
561                 break;
562         default:
563                 panic("hammer_cursor_down: illegal btype %02x (%c)\n",
564                       elm->base.btype,
565                       (elm->base.btype ? elm->base.btype : '?'));
566                 break;
567         }
568         if (error == 0) {
569                 hammer_lock_ex(&node->lock);
570                 cursor->node = node;
571                 cursor->index = 0;
572         }
573         return(error);
574 }
575