/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.8 2008/02/08 08:31:00 dillon Exp $
 */

#include "hammer.h"

#if 0

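/*
 * NOTE: this entire legacy cluster-recovery implementation is compiled
 * out via the surrounding #if 0 / #endif pair and is retained for
 * reference only.
 */
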
static int hammer_recover_buffer_stage2(hammer_cluster_t cluster,
                                int32_t buf_no);
static int hammer_recover_record(hammer_cluster_t cluster,
                                hammer_buffer_t buffer, int32_t rec_offset,
                                hammer_record_ondisk_t rec);
static int hammer_recover_btree(hammer_cluster_t cluster,
                                hammer_buffer_t buffer, int32_t rec_offset,
                                hammer_record_ondisk_t rec);

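/*
 * Overview of the recovery pipeline implemented below:
 *
 *  hammer_recover()               - per-cluster entry point; resets the
 *                                   cluster's A-lists and constructs an
 *                                   empty B-Tree root.
 *  buffer_alist_recover()         - stage 1 A-list callback; re-reserves
 *                                   each record buffer and validates its
 *                                   records via hammer_recover_record().
 *  hammer_recover_buffer_stage2() - stage 2; re-enters the surviving
 *                                   records into the new B-Tree via
 *                                   hammer_recover_btree().
 */
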
/*
 * Recover a cluster.  The caller has referenced and locked the cluster.
 *
 * Generally returns 0 on success and EIO if the recovery was unsuccessful.
 *
 * WARNING!  The cluster being recovered must not have any cached buffers
 * (and hence no cached B-Tree nodes).  Any cached nodes would become
 * seriously corrupted, since we rip the whole B-Tree up and regenerate it.
 */
int
hammer_recover(hammer_cluster_t cluster)
{
        int buf_no;
        int rec_no;
        int maxblk;
        int nbuffers;
        int buffer_count;
        int record_count;

        kprintf("HAMMER_RECOVER %d:%d\n",
                cluster->volume->vol_no, cluster->clu_no);
        /*Debugger("RECOVER");*/
        KKASSERT(cluster->ondisk->synchronized_rec_id);
        if (RB_ROOT(&cluster->rb_bufs_root)) {
                panic("hammer_recover: cluster %d:%d has cached buffers!",
                        cluster->volume->vol_no,
                        cluster->clu_no);
        }

        if (hammer_alist_find(&cluster->volume->alist, cluster->clu_no,
                              cluster->clu_no + 1, 0) != cluster->clu_no) {
                Debugger("hammer_recover: cluster not allocated!");
        }

        nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
        hammer_modify_cluster(cluster);

        /*
         * Clear statistics.
         */
        cluster->ondisk->stat_inodes = 0;
        cluster->ondisk->stat_records = 0;
        cluster->ondisk->stat_data_bufs = 0;
        cluster->ondisk->stat_rec_bufs = 0;
        cluster->ondisk->stat_idx_bufs = 0;

        /*
         * Reset allocation heuristics.
         */
        cluster->ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
        cluster->ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
        cluster->ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;

        /*
         * Re-initialize the master, B-Tree, and mdata A-lists, and
         * recover the record A-list.
         */
        hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
                          HAMMER_ASTATE_FREE);
        hammer_alist_init(&cluster->alist_btree,
                          HAMMER_FSBUF_MAXBLKS,
                          (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
                          HAMMER_ASTATE_ALLOC);
        hammer_alist_init(&cluster->alist_mdata,
                          HAMMER_FSBUF_MAXBLKS,
                          (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
                          HAMMER_ASTATE_ALLOC);
        hammer_alist_recover(&cluster->alist_record,
                          0,
                          HAMMER_FSBUF_MAXBLKS,
                          (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS);
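
        /*
         * State after the re-initialization above: the master list is
         * all-free except for buffer 0 (the init range starts at block 1,
         * presumably because buffer 0 holds the cluster header), the
         * B-Tree and mdata lists start all-allocated pending explicit
         * re-reservation, and the record list has been reconstructed
         * from its on-disk meta-blocks.
         */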
        kprintf("\n");

        kprintf("hammer_recover(1): cluster_free %d\n",
                cluster->alist_master.meta->bm_alist_freeblks);

        /*
         * The cluster is now in good enough shape that general allocations
         * are possible.  Construct an empty B-Tree root.
         */
        {
                hammer_node_t croot;
                int error;

                croot = hammer_alloc_btree(cluster, &error);
                if (error == 0) {
                        hammer_modify_node(croot);
                        bzero(croot->ondisk, sizeof(*croot->ondisk));
                        croot->ondisk->count = 0;
                        croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
                        cluster->ondisk->clu_btree_root = croot->node_offset;
                        hammer_rel_node(croot);
                }
                KKASSERT(error == 0);
        }
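
        /*
         * The new root is an empty LEAF; stage 2 below repopulates it one
         * record at a time through normal B-Tree insertions.
         */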
        kprintf("hammer_recover(2): cluster_free %d\n",
                cluster->alist_master.meta->bm_alist_freeblks);

        /*
         * Scan the cluster's recovered record A-list.  Just pick up the
         * meta-blocks and ignore all-allocated/uninitialized sections
         * (which we use to indicate reserved areas not assigned to
         * record buffers).
         *
         * The all-free sections are initialized, which is indicated by
         * the alist config's bl_inverted flag being set.  These sections
         * will be returned for recovery purposes.
         */
        buffer_count = 0;
        record_count = 0;

        rec_no = HAMMER_FSBUF_MAXBLKS;
        maxblk = nbuffers * HAMMER_FSBUF_MAXBLKS;
        for (;;) {
                rec_no = hammer_alist_find(&cluster->alist_record,
                                           rec_no,
                                           maxblk,
                                           HAMMER_ALIST_FIND_NOSTACK |
                                           HAMMER_ALIST_FIND_INITONLY);
                if (rec_no == HAMMER_ALIST_BLOCK_NONE)
                        break;
                buf_no = rec_no / HAMMER_FSBUF_MAXBLKS;
                KKASSERT(buf_no > 0 && buf_no <= nbuffers);
                ++buffer_count;
                kprintf("(%d)", buf_no);
                record_count += hammer_recover_buffer_stage2(cluster, buf_no);
                rec_no += HAMMER_FSBUF_MAXBLKS;
        }
        kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
                cluster->volume->vol_no, cluster->clu_no,
                buffer_count, record_count);

        /*
         * Validate the parent cluster pointer. XXX
         */

        /*
         * On successful recovery mark the cluster validated.
         */
        cluster->io.validated = 1;
        return(0);
}
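
/*
 * Hypothetical caller sketch (not part of the original source).  A caller
 * must hold a reference and the lock on the cluster before recovering it;
 * using only the accessors visible in this file the shape would be roughly:
 *
 *      cluster = hammer_get_cluster(volume, clu_no, &error,
 *                                   GET_CLUSTER_NORECOVER);
 *      if (error == 0) {
 *              error = hammer_recover(cluster);
 *              hammer_rel_cluster(cluster, 0);
 *      }
 *
 * The real call site and its exact locking discipline live elsewhere in
 * the HAMMER sources.
 */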

/*
 * This is used in the alist callback and must return a negative error
 * code or a positive free block count.
 */
int
buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
{
        hammer_cluster_t cluster;
        hammer_record_ondisk_t rec;
        hammer_buffer_t buffer;
        int32_t buf_no;
        int32_t rec_no;
        int32_t rec_offset;
        int32_t r;
        int error;
        int xcount;

        /*
         * Extract the cluster and the buffer number to recover.
         */
        cluster = info;
        buf_no = blk / HAMMER_FSBUF_MAXBLKS;

        kprintf("(%d)", buf_no);
        buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
        if (error) {
                /*
                 * If we are unable to access the buffer leave it in a
                 * reserved state on the master alist.
                 */
                kprintf("hammer_recover_buffer_stage1: error "
                        "recovering %d:%d:%d\n",
                        cluster->volume->vol_no, cluster->clu_no, buf_no);
                r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
                KKASSERT(r == buf_no);
                return(-error);
        }
        KKASSERT(buffer->buf_type == HAMMER_FSBUF_RECORDS);

        /*
         * If the buffer contains no allocated records, tell our parent to
         * mark it as all-allocated/uninitialized and do not reserve it
         * in the master list.
         */
        if (hammer_alist_find(&buffer->alist, 0, HAMMER_RECORD_NODES, 0) ==
            HAMMER_ALIST_BLOCK_NONE) {
                kprintf("GENERAL RECOVERY BUFFER %d\n",
                        blk / HAMMER_FSBUF_MAXBLKS);
                hammer_rel_buffer(buffer, 0);
                return(-EDOM);
        }
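
        /*
         * Note that -EDOM above is an in-band signal rather than a real
         * failure: per the comment preceding the check, the parent a-list
         * layer treats this buffer's range as all-allocated/uninitialized
         * instead of reserving it.
         */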

        /*
         * Mark the buffer as allocated in the cluster's master A-list.
         */
        r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
        KKASSERT(r == buf_no);
        ++cluster->ondisk->stat_rec_bufs;

        kprintf("recover buffer1 %d:%d:%d cluster_free %d\n",
                cluster->volume->vol_no,
                cluster->clu_no, buf_no,
                cluster->alist_master.meta->bm_alist_freeblks);

        /*
         * Recover the buffer, scan and validate the allocated records.
         * Records which cannot be recovered are freed.
         *
         * The parent a-list must be properly adjusted, so don't just call
         * hammer_alist_recover() on the underlying buffer.  Go through the
         * parent.
         */
        hammer_modify_buffer(buffer);
        count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
        xcount = 0;
        kprintf("hammer_recover_buffer count1 %d/%d\n",
                HAMMER_RECORD_NODES - count, HAMMER_RECORD_NODES);
        rec_no = 0;
        for (;;) {
                rec_no = hammer_alist_find(&buffer->alist, rec_no,
                                           HAMMER_RECORD_NODES, 0);
                if (rec_no == HAMMER_ALIST_BLOCK_NONE)
                        break;
#if 0
                kprintf("recover record %d:%d:%d %d\n",
                        cluster->volume->vol_no,
                        cluster->clu_no, buf_no, rec_no);
#endif
                rec_offset = offsetof(union hammer_fsbuf_ondisk,
                                      record.recs[rec_no]);
                rec_offset += buf_no * HAMMER_BUFSIZE;
                rec = &buffer->ondisk->record.recs[rec_no];
                error = hammer_recover_record(cluster, buffer, rec_offset, rec);
                if (error) {
                        kprintf("hammer_recover_record: failed %d:%d@%d\n",
                                cluster->clu_no, buffer->buf_no, rec_offset);
                        hammer_alist_free(&buffer->alist, rec_no, 1);
                        if (hammer_debug_recover_faults)
                                Debugger("FAILED");
                        ++count;        /* free count */
                        --xcount;
                }
                ++rec_no;
                ++xcount;
        }
        kprintf("hammer_recover_buffer count2 %d/%d/%d\n",
                HAMMER_RECORD_NODES - count, xcount, HAMMER_RECORD_NODES);
        KKASSERT(HAMMER_RECORD_NODES - count == xcount);
        hammer_rel_buffer(buffer, 0);
        return(count);
}
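
/*
 * Return-value example for the callback above: a record buffer holding N
 * validated records yields a free-block count of HAMMER_RECORD_NODES - N,
 * which is exactly the relationship the KKASSERT against xcount verifies.
 */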

/*
 * Recover a record, at least into a state that doesn't blow up the
 * filesystem.  Returns 0 on success, non-zero if the record is
 * unrecoverable.
 */
static int
hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
                             int32_t rec_offset, hammer_record_ondisk_t rec)
{
        hammer_buffer_t dbuf;
        u_int64_t syncid = cluster->ondisk->synchronized_rec_id;
        int32_t data_offset;
        int32_t data_len;
        int32_t nblks;
        int32_t dbuf_no;
        int32_t dblk_no;
        int32_t base_blk;
        int32_t r;
        int error = 0;

        /*
         * We have to discard any records with rec_ids greater than the
         * last sync of the cluster header (which guaranteed all related
         * buffers had been synced).  Otherwise the record may reference
         * information that was never synced to disk.
         */
        if (rec->base.rec_id >= syncid) {
                kprintf("recover record: syncid too large %016llx/%016llx\n",
                        rec->base.rec_id, syncid);
                if (hammer_debug_recover_faults)
                        Debugger("DebugSyncid");
                return(EINVAL);
        }

#if 0
        /* XXX undo incomplete deletions */
        if (rec->base.base.delete_tid > syncid)
                rec->base.base.delete_tid = 0;
#endif

        /*
         * Validate the record's B-Tree key.
         */
        KKASSERT(rec->base.base.rec_type != 0);
        if (rec->base.base.rec_type != HAMMER_RECTYPE_CLUSTER) {
                if (hammer_btree_cmp(&rec->base.base,
                                     &cluster->ondisk->clu_btree_beg) < 0)  {
                        kprintf("recover record: range low\n");
                        Debugger("RANGE LOW");
                        return(EINVAL);
                }
                if (hammer_btree_cmp(&rec->base.base,
                                     &cluster->ondisk->clu_btree_end) >= 0)  {
                        kprintf("recover record: range high\n");
                        Debugger("RANGE HIGH");
                        return(EINVAL);
                }
        }

        /*
         * Validate the record's data.  If the offset is 0 there is no data
         * (or it is zero-fill) and we can return success immediately.
         * Otherwise make sure everything is ok.
         */
        data_offset = rec->base.data_offset;
        data_len = rec->base.data_len;

        if (data_len == 0)
                rec->base.data_offset = data_offset = 0;
        if (data_offset == 0)
                goto done;

        /*
         * Non-zero data offset, recover the data.
         */
        if (data_offset < HAMMER_BUFSIZE ||
            data_offset >= cluster->ondisk->clu_limit ||
            data_len < 0 || data_len > HAMMER_MAXDATA ||
            data_offset + data_len > cluster->ondisk->clu_limit) {
                kprintf("recover record: bad offset/len %d/%d\n",
                        data_offset, data_len);
                Debugger("BAD OFFSET");
                return(EINVAL);
        }

        /*
         * Check data_offset relative to rec_offset.
         */
        if (data_offset < rec_offset && data_offset + data_len > rec_offset) {
                kprintf("recover record: bad offset: overlapping1\n");
                Debugger("BAD OFFSET - OVERLAP1");
                return(EINVAL);
        }
        if (data_offset >= rec_offset &&
            data_offset < rec_offset + sizeof(struct hammer_base_record)) {
                kprintf("recover record: bad offset: overlapping2\n");
                Debugger("BAD OFFSET - OVERLAP2");
                return(EINVAL);
        }

        /*
         * Check for data embedded in the record.
         */
        if (data_offset >= rec_offset &&
            data_offset < rec_offset + HAMMER_RECORD_SIZE) {
                if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE) {
                        kprintf("recover record: bad offset: overlapping3\n");
                        Debugger("BAD OFFSET - OVERLAP3");
                        return(EINVAL);
                }
                goto done;
        }
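
        /*
         * Past this point the data is known to live entirely outside the
         * record: the overlap checks above rejected every partial
         * intersection, and fully-embedded data already exited via the
         * done label.
         */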

        KKASSERT(cluster->io.modified);
        /*
         * Recover the allocated data either out of the cluster's master
         * alist or as a buffer sub-allocation.
         */
        if ((data_len & HAMMER_BUFMASK) == 0) {
                if (data_offset & HAMMER_BUFMASK) {
                        kprintf("recover record: bad offset: unaligned\n");
                        Debugger("BAD OFFSET - UNALIGNED");
                        return(EINVAL);
                }
                nblks = data_len / HAMMER_BUFSIZE;
                dbuf_no = data_offset / HAMMER_BUFSIZE;
                /* XXX power-of-2 check data_len */

                r = hammer_alist_alloc_fwd(&cluster->alist_master,
                                           nblks, dbuf_no);
                if (r == HAMMER_ALIST_BLOCK_NONE) {
                        kprintf("recover record: cannot recover offset1\n");
                        Debugger("CANNOT ALLOC DATABUFFER");
                        return(EINVAL);
                }
                if (r != dbuf_no) {
                        kprintf("recover record: cannot recover offset2\n");
                        hammer_alist_free(&cluster->alist_master, r, nblks);
                        KKASSERT(0);
                        return(EINVAL);
                }
                ++cluster->ondisk->stat_data_bufs;
        } else {
                if ((data_offset & ~HAMMER_BUFMASK) !=
                    ((data_offset + data_len - 1) & ~HAMMER_BUFMASK)) {
                        kprintf("recover record: overlaps multiple bufs\n");
                        Debugger("OVERLAP MULT");
                        return(EINVAL);
                }
                if ((data_offset & HAMMER_BUFMASK) <
                    sizeof(struct hammer_fsbuf_head)) {
                        kprintf("recover record: data in header area\n");
                        Debugger("DATA IN HEADER AREA");
                        return(EINVAL);
                }
                if (data_offset & HAMMER_DATA_BLKMASK) {
                        kprintf("recover record: data blk unaligned\n");
                        Debugger("DATA BLK UNALIGNED");
                        return(EINVAL);
                }

                /*
                 * Ok, recover the space in the data buffer.
                 */
                dbuf_no = data_offset / HAMMER_BUFSIZE;
                r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
                if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
                        hammer_alist_free(&cluster->alist_master, r, 1);
                if (r == dbuf_no) {
                        /*
                         * This is the first time we've tried to recover
                         * data in this data buffer; reinitialize it (but
                         * don't zero it out, obviously).
                         *
                         * Calling initbuffer marks the data blocks within
                         * the buffer as being all-allocated.  We have to
                         * mark them free.
                         */
                        dbuf = hammer_get_buffer(cluster, dbuf_no,
                                                 0, &error);
                        if (error == 0) {
                                KKASSERT(dbuf->buf_type == HAMMER_FSBUF_DATA);
                                hammer_modify_buffer(dbuf);
                                hammer_initbuffer(&dbuf->alist,
                                                  &dbuf->ondisk->head,
                                                  HAMMER_FSBUF_DATA);
                                /*dbuf->buf_type = HAMMER_FSBUF_DATA;*/
                                base_blk = dbuf_no * HAMMER_FSBUF_MAXBLKS;
                                hammer_alist_free(&cluster->alist_mdata,
                                                  base_blk,
                                                  HAMMER_DATA_NODES);
                                kprintf("FREE DATA %d/%d\n",
                                        base_blk, HAMMER_DATA_NODES);
                                ++cluster->ondisk->stat_data_bufs;
                        }
                } else {
                        /*
                         * We've seen this data buffer before.
                         */
                        dbuf = hammer_get_buffer(cluster, dbuf_no,
                                                 0, &error);
                }
                if (error) {
                        kprintf("recover record: data: getbuf failed\n");
                        KKASSERT(0);
                        return(EINVAL);
                }

                if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
                        hammer_rel_buffer(dbuf, 0);
                        kprintf("recover record: data: wrong buffer type\n");
                        KKASSERT(0);
                        return(EINVAL);
                }

                /*
                 * Figure out the data block number and number of blocks.
                 */
                nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
                nblks /= HAMMER_DATA_BLKSIZE;
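                /*
                 * Round-up example: data_len is rounded up to a whole
                 * number of HAMMER_DATA_BLKSIZE blocks, so e.g.
                 * data_len = HAMMER_DATA_BLKSIZE + 1 yields nblks = 2.
                 */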
                dblk_no = ((data_offset & HAMMER_BUFMASK) -
                           offsetof(union hammer_fsbuf_ondisk, data.data)) /
                          HAMMER_DATA_BLKSIZE;
                if ((data_offset & HAMMER_BUFMASK) !=
                    offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
                        kprintf("dblk_no %d does not match data_offset %d/%d\n",
                                dblk_no,
                                offsetof(union hammer_fsbuf_ondisk,
                                         data.data[dblk_no]),
                                (data_offset & HAMMER_BUFMASK));
                        hammer_rel_buffer(dbuf, 0);
                        kprintf("recover record: data: not block aligned\n");
                        Debugger("bad data");
                        return(EINVAL);
                }
                hammer_modify_buffer(dbuf);
                dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS;
                r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks,
                                           dblk_no);
                if (r != dblk_no) {
                        if (r != HAMMER_ALIST_BLOCK_NONE)
                                hammer_alist_free(&cluster->alist_mdata,
                                                  r, nblks);
                        hammer_rel_buffer(dbuf, 0);
                        kprintf("recover record: data: unable to realloc "
                                "dbuf %d dblk %d\n",
                                dbuf_no, dblk_no % HAMMER_FSBUF_MAXBLKS);
                        KKASSERT(0);
                        return(EINVAL);
                }
                hammer_rel_buffer(dbuf, 0);
        }
done:
        return(0);
}

/*
 * Rebuild the B-Tree for the records residing in the specified buffer.
 *
 * Return the number of records recovered.
 */
static int
hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
{
        hammer_record_ondisk_t rec;
        hammer_buffer_t buffer;
        int32_t rec_no;
        int32_t rec_offset;
        int record_count = 0;
        int error;

        buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
        if (error) {
                /*
                 * If we are unable to access the buffer leave it in a
                 * reserved state on the master alist.
                 */
                kprintf("hammer_recover_buffer_stage2: error "
                        "recovering %d:%d:%d\n",
                        cluster->volume->vol_no, cluster->clu_no, buf_no);
                Debugger("RECOVER BUFFER STAGE2 FAIL");
                return(0);
        }

        /*
         * Recover the buffer, scan and validate the allocated records.
         * Records which cannot be recovered are freed.
         */
        rec_no = 0;
        for (;;) {
                rec_no = hammer_alist_find(&buffer->alist, rec_no,
                                           HAMMER_RECORD_NODES, 0);
                if (rec_no == HAMMER_ALIST_BLOCK_NONE)
                        break;
                rec_offset = offsetof(union hammer_fsbuf_ondisk,
                                      record.recs[rec_no]);
                rec_offset += buf_no * HAMMER_BUFSIZE;
                rec = &buffer->ondisk->record.recs[rec_no];
                error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
                if (error) {
                        kprintf("hammer_recover_btree: failed %d:%d@%08x "
                                "error %d buffer %p rec %p rec_no %d "
                                "cluster_free %d\n",
                                cluster->clu_no, buffer->buf_no, rec_offset,
                                error, buffer, rec, rec_no,
                                cluster->alist_master.meta->bm_alist_freeblks
                        );
                        Debugger("recover_btree failed");
                        /* XXX free the record and its data? */
                        /*hammer_alist_free(&buffer->alist, rec_no, 1);*/
                } else {
                        ++record_count;
                }
                ++rec_no;
        }
        hammer_rel_buffer(buffer, 0);
        return(record_count);
}

/*
 * Enter a single record into the B-Tree.
 */
static int
hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
                      int32_t rec_offset, hammer_record_ondisk_t rec)
{
        struct hammer_cursor cursor;
        union hammer_btree_elm elm;
        hammer_cluster_t ncluster;
        int error = 0;

        /*
         * Check for a spike record.  When spiking into a new cluster do
         * NOT allow a recursive recovery to occur.  We use a lot of
         * stack and the only thing we actually modify in the target
         * cluster is its parent pointer.
         */
        if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) {
                hammer_volume_t ovolume = cluster->volume;
                hammer_volume_t nvolume;

                nvolume = hammer_get_volume(ovolume->hmp, rec->spike.vol_no,
                                            &error);
                if (error) {
                        Debugger("recover_btree1");
                        return(error);
                }
                ncluster = hammer_get_cluster(nvolume, rec->spike.clu_no,
                                              &error, GET_CLUSTER_NORECOVER);
                hammer_rel_volume(nvolume, 0);
                if (error) {
                        Debugger("recover_btree2");
                        return(error);
                }

                /*
                 * Validate the cluster.  Allow the offset to be fixed up.
                 */
                if (ncluster->ondisk->clu_btree_parent_vol_no != ovolume->vol_no ||
                    ncluster->ondisk->clu_btree_parent_clu_no != cluster->clu_no) {
                        kprintf("hammer_recover: Bad cluster spike hookup: "
                                "%d:%d != %d:%d\n",
                                ncluster->ondisk->clu_btree_parent_vol_no,
                                ncluster->ondisk->clu_btree_parent_clu_no,
                                ovolume->vol_no,
                                cluster->clu_no);
                        error = EINVAL;
                        hammer_rel_cluster(ncluster, 0);
                        Debugger("recover_btree3");
                        return(error);
                }
        } else {
                ncluster = NULL;
        }
671         /*
672          * Locate the insertion point.  Note that we are using the cluster-
673          * localized cursor init so parent will start out NULL.
674          *
675          * The key(s) used for spike's are bounds and different from the
676          * key embedded in the spike record.  A special B-Tree insertion
677          * call is made to deal with spikes.
678          */
679         error = hammer_init_cursor_cluster(&cursor, cluster);
680         if (error) {
681                 Debugger("recover_btree6");
682                 goto failed;
683         }
684         KKASSERT(cursor.node);
685         if (ncluster)
686                 cursor.key_beg = ncluster->ondisk->clu_btree_beg;
687         else
688                 cursor.key_beg = rec->base.base;
689         cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_RECOVER;
690
691         error = hammer_btree_lookup(&cursor);
692         KKASSERT(error != EDEADLK);
693         KKASSERT(cursor.node);
694         if (error == 0) {
695                 kprintf("hammer_recover_btree: Duplicate record cursor %p rec %p ncluster %p\n",
696                         &cursor, rec, ncluster);
697                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index], HAMMER_BTREE_TYPE_LEAF, cursor.index);
698                 Debugger("duplicate record");
699         }
700         if (error != ENOENT) {
701                 Debugger("recover_btree5");
702                 goto failed;
703         }
704
705
706         if (ncluster) {
707                 /*
708                  * Spike record
709                  */
710                 kprintf("recover spike clu %d %016llx-%016llx clusterfree %d\n",
711                         ncluster->clu_no,
712                         ncluster->ondisk->clu_btree_beg.obj_id,
713                         ncluster->ondisk->clu_btree_end.obj_id,
714                         cluster->alist_master.meta->bm_alist_freeblks);
715                 error = hammer_btree_insert_cluster(&cursor, ncluster,
716                                                     rec_offset);
717                 kprintf("recover spike record error %d clusterfree %d\n",
718                         error, 
719                         cluster->alist_master.meta->bm_alist_freeblks);
720                 KKASSERT(error != EDEADLK);
721                 if (error)
722                         Debugger("spike recovery");
723         } else {
724                 /*
725                  * Normal record
726                  */
727 #if 0
728                 kprintf("recover recrd clu %d %016llx\n",
729                         cluster->clu_no, rec->base.base.obj_id);
730 #endif
731                 elm.leaf.base = rec->base.base;
732                 elm.leaf.rec_offset = rec_offset;
733                 elm.leaf.data_offset = rec->base.data_offset;
734                 elm.leaf.data_len = rec->base.data_len;
735                 elm.leaf.data_crc = rec->base.data_crc;
736
737                 error = hammer_btree_insert(&cursor, &elm);
738                 KKASSERT(error != EDEADLK);
739         }
740
741         /*
742          * Success if error is 0!
743          */
744         if (error == 0) {
745                 /*
746                  * Update the cluster header's statistics count.  stat_records
747                  * is very important for proper reservation of B-Tree space.
748                  * Note that a spike record counts as 2.
749                  */
750                 ++cluster->ondisk->stat_records;
751                 if (rec->base.base.rec_type == HAMMER_RECTYPE_INODE)
752                         ++cluster->ondisk->stat_inodes;
753                 if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER)
754                         ++cluster->ondisk->stat_records;
755         }
756         if (error) {
757                 kprintf("hammer_recover_btree: insertion failed\n");
758         }
759
760 failed:
761         if (ncluster)
762                 hammer_rel_cluster(ncluster, 0);
763         hammer_done_cursor(&cursor);
764         return(error);
765 }
766
767 #endif