HAMMER 28/many: Implement zoned blockmap
[dragonfly.git] / sbin / hammer / ondisk.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sbin/hammer/ondisk.c,v 1.11 2008/02/10 09:50:55 dillon Exp $
35  */
36
37 #include <sys/types.h>
38 #include <assert.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <stdarg.h>
42 #include <string.h>
43 #include <unistd.h>
44 #include <err.h>
45 #include <fcntl.h>
46 #include "hammer_util.h"
47
/*
 * Forward declarations for the file-local helpers defined further down.
 * The prototypes inside the #if 0 section belong to helpers whose
 * definitions are likewise compiled out in this file.
 */
static void *alloc_blockmap(int zone, int bytes, hammer_off_t *result_offp,
                       struct buffer_info **bufferp);
static hammer_off_t alloc_bigblock(void);
#if 0
static void init_fifo_head(hammer_fifo_head_t head, u_int16_t hdr_type);
static hammer_off_t hammer_alloc_fifo(int32_t base_bytes, int32_t ext_bytes,
                        struct buffer_info **bufp, u_int16_t hdr_type);
static void readhammerbuf(struct volume_info *vol, void *data,
                        int64_t offset);
#endif
static void writehammerbuf(struct volume_info *vol, const void *data,
                        int64_t offset);


/*
 * Global state.
 *
 * Hammer_FSType - compared against vol_fstype of every existing volume
 *                 opened by setup_volume().
 * Hammer_FSId   - loaded from the first existing volume opened; every
 *                 later existing volume must carry the same vol_fsid.
 * RootVolNo     - root volume number; stays -1 until setup_volume() reads
 *                 it from an existing volume header (or it is assigned
 *                 elsewhere).
 * VolList       - every volume linked in by setup_volume().
 *
 * BootAreaSize, MemAreaSize, UsingSuperClusters and NumVolumes are not
 * referenced by the code visible in this file; presumably they are set
 * and consumed by the utilities linking against it - TODO confirm.
 */
uuid_t Hammer_FSType;
uuid_t Hammer_FSId;
int64_t BootAreaSize;
int64_t MemAreaSize;
int     UsingSuperClusters;
int     NumVolumes;
int     RootVolNo = -1;
struct volume_list VolList = TAILQ_HEAD_INITIALIZER(VolList);
70
71 /*
72  * Lookup the requested information structure and related on-disk buffer.
73  * Missing structures are created.
74  */
75 struct volume_info *
76 setup_volume(int32_t vol_no, const char *filename, int isnew, int oflags)
77 {
78         struct volume_info *vol;
79         struct volume_info *scan;
80         struct hammer_volume_ondisk *ondisk;
81         int n;
82
83         /*
84          * Allocate the volume structure
85          */
86         vol = malloc(sizeof(*vol));
87         bzero(vol, sizeof(*vol));
88         TAILQ_INIT(&vol->buffer_list);
89         vol->name = strdup(filename);
90         vol->fd = open(filename, oflags);
91         if (vol->fd < 0) {
92                 free(vol->name);
93                 free(vol);
94                 err(1, "setup_volume: %s: Open failed", filename);
95         }
96
97         /*
98          * Read or initialize the volume header
99          */
100         vol->ondisk = ondisk = malloc(HAMMER_BUFSIZE);
101         if (isnew) {
102                 bzero(ondisk, HAMMER_BUFSIZE);
103         } else {
104                 n = pread(vol->fd, ondisk, HAMMER_BUFSIZE, 0);
105                 if (n != HAMMER_BUFSIZE) {
106                         err(1, "setup_volume: %s: Read failed at offset 0",
107                             filename);
108                 }
109                 vol_no = ondisk->vol_no;
110                 if (RootVolNo < 0) {
111                         RootVolNo = ondisk->vol_rootvol;
112                 } else if (RootVolNo != (int)ondisk->vol_rootvol) {
113                         errx(1, "setup_volume: %s: root volume disagreement: "
114                                 "%d vs %d",
115                                 vol->name, RootVolNo, ondisk->vol_rootvol);
116                 }
117
118                 if (bcmp(&Hammer_FSType, &ondisk->vol_fstype, sizeof(Hammer_FSType)) != 0) {
119                         errx(1, "setup_volume: %s: Header does not indicate "
120                                 "that this is a hammer volume", vol->name);
121                 }
122                 if (TAILQ_EMPTY(&VolList)) {
123                         Hammer_FSId = vol->ondisk->vol_fsid;
124                 } else if (bcmp(&Hammer_FSId, &ondisk->vol_fsid, sizeof(Hammer_FSId)) != 0) {
125                         errx(1, "setup_volume: %s: FSId does match other "
126                                 "volumes!", vol->name);
127                 }
128         }
129         vol->vol_no = vol_no;
130
131         if (isnew) {
132                 /*init_fifo_head(&ondisk->head, HAMMER_HEAD_TYPE_VOL);*/
133                 vol->cache.modified = 1;
134         }
135
136         /*
137          * Link the volume structure in
138          */
139         TAILQ_FOREACH(scan, &VolList, entry) {
140                 if (scan->vol_no == vol_no) {
141                         errx(1, "setup_volume %s: Duplicate volume number %d "
142                                 "against %s", filename, vol_no, scan->name);
143                 }
144         }
145         TAILQ_INSERT_TAIL(&VolList, vol, entry);
146         return(vol);
147 }
148
149 struct volume_info *
150 get_volume(int32_t vol_no)
151 {
152         struct volume_info *vol;
153
154         TAILQ_FOREACH(vol, &VolList, entry) {
155                 if (vol->vol_no == vol_no)
156                         break;
157         }
158         if (vol == NULL)
159                 errx(1, "get_volume: Volume %d does not exist!", vol_no);
160         ++vol->cache.refs;
161         /* not added to or removed from hammer cache */
162         return(vol);
163 }
164
165 void
166 rel_volume(struct volume_info *volume)
167 {
168         /* not added to or removed from hammer cache */
169         --volume->cache.refs;
170 }
171
172 /*
173  * Acquire the specified buffer.
174  */
175 struct buffer_info *
176 get_buffer(hammer_off_t buf_offset, int isnew)
177 {
178         void *ondisk;
179         struct buffer_info *buf;
180         struct volume_info *volume;
181         int n;
182         int vol_no;
183
184         assert((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
185
186         vol_no = HAMMER_VOL_DECODE(buf_offset);
187         volume = get_volume(vol_no);
188         buf_offset &= ~HAMMER_BUFMASK64;
189
190         TAILQ_FOREACH(buf, &volume->buffer_list, entry) {
191                 if (buf->buf_offset == buf_offset)
192                         break;
193         }
194         if (buf == NULL) {
195                 buf = malloc(sizeof(*buf));
196                 bzero(buf, sizeof(*buf));
197                 buf->buf_offset = buf_offset;
198                 buf->buf_disk_offset = volume->ondisk->vol_buf_beg +
199                                         (buf_offset & HAMMER_OFF_SHORT_MASK);
200                 buf->volume = volume;
201                 TAILQ_INSERT_TAIL(&volume->buffer_list, buf, entry);
202                 ++volume->cache.refs;
203                 buf->cache.u.buffer = buf;
204                 hammer_cache_add(&buf->cache, ISBUFFER);
205         }
206         ++buf->cache.refs;
207         hammer_cache_flush();
208         if ((ondisk = buf->ondisk) == NULL) {
209                 buf->ondisk = ondisk = malloc(HAMMER_BUFSIZE);
210                 if (isnew == 0) {
211                         n = pread(volume->fd, ondisk, HAMMER_BUFSIZE,
212                                   buf->buf_disk_offset);
213                         if (n != HAMMER_BUFSIZE) {
214                                 err(1, "get_buffer: %s:%016llx Read failed at "
215                                        "offset %lld",
216                                     volume->name, buf->buf_offset,
217                                     buf->buf_disk_offset);
218                         }
219                 }
220         }
221         if (isnew) {
222                 bzero(ondisk, HAMMER_BUFSIZE);
223                 buf->cache.modified = 1;
224         }
225         return(buf);
226 }
227
228 void
229 rel_buffer(struct buffer_info *buffer)
230 {
231         struct volume_info *volume;
232
233         assert(buffer->cache.refs > 0);
234         if (--buffer->cache.refs == 0) {
235                 if (buffer->cache.delete) {
236                         volume = buffer->volume;
237                         if (buffer->cache.modified)
238                                 flush_buffer(buffer);
239                         TAILQ_REMOVE(&volume->buffer_list, buffer, entry);
240                         hammer_cache_del(&buffer->cache);
241                         free(buffer->ondisk);
242                         free(buffer);
243                         rel_volume(volume);
244                 }
245         }
246 }
247
248 void *
249 get_buffer_data(hammer_off_t buf_offset, struct buffer_info **bufferp,
250                 int isnew)
251 {
252         struct buffer_info *buffer;
253
254         if (*bufferp) {
255                 rel_buffer(*bufferp);
256         }
257         buffer = *bufferp = get_buffer(buf_offset, isnew);
258         return((char *)buffer->ondisk + ((int32_t)buf_offset & HAMMER_BUFMASK));
259 }
260
261 /*
262  * Retrieve a pointer to a B-Tree node given a cluster offset.  The underlying
263  * bufp is freed if non-NULL and a referenced buffer is loaded into it.
264  */
265 hammer_node_ondisk_t
266 get_node(hammer_off_t node_offset, struct buffer_info **bufp)
267 {
268         struct buffer_info *buf;
269
270         if (*bufp)
271                 rel_buffer(*bufp);
272         *bufp = buf = get_buffer(node_offset, 0);
273         return((void *)((char *)buf->ondisk +
274                         (int32_t)(node_offset & HAMMER_BUFMASK)));
275 }
276
277 /*
278  * Allocate HAMMER elements - btree nodes, data storage, and record elements
279  *
280  * NOTE: hammer_alloc_fifo() initializes the fifo header for the returned
281  * item and zero's out the remainder, so don't bzero() it.
282  */
283 void *
284 alloc_btree_element(hammer_off_t *offp)
285 {
286         struct buffer_info *buffer = NULL;
287         hammer_node_ondisk_t node;
288
289         node = alloc_blockmap(HAMMER_ZONE_BTREE_INDEX, sizeof(*node),
290                               offp, &buffer);
291         bzero(node, sizeof(*node));
292         /* XXX buffer not released, pointer remains valid */
293         return(node);
294 }
295
296 hammer_record_ondisk_t
297 alloc_record_element(hammer_off_t *offp, int32_t data_len, void **datap)
298 {
299         struct buffer_info *record_buffer = NULL;
300         struct buffer_info *data_buffer = NULL;
301         hammer_record_ondisk_t rec;
302
303         rec = alloc_blockmap(HAMMER_ZONE_RECORD_INDEX, sizeof(*rec),
304                              offp, &record_buffer);
305         bzero(rec, sizeof(*rec));
306
307         if (data_len >= HAMMER_BUFSIZE) {
308                 assert(data_len <= HAMMER_BUFSIZE); /* just one buffer */
309                 *datap = alloc_blockmap(HAMMER_ZONE_LARGE_DATA_INDEX, data_len,
310                                         &rec->base.data_off, &data_buffer);
311                 rec->base.data_len = data_len;
312                 bzero(*datap, data_len);
313         } else if (data_len) {
314                 *datap = alloc_blockmap(HAMMER_ZONE_SMALL_DATA_INDEX, data_len,
315                                         &rec->base.data_off, &data_buffer);
316                 rec->base.data_len = data_len;
317                 bzero(*datap, data_len);
318         } else {
319                 *datap = NULL;
320         }
321         /* XXX buf not released, ptr remains valid */
322         return(rec);
323 }
324
325 /*
326  * Format a new blockmap
327  */
328 void
329 format_blockmap(hammer_blockmap_entry_t blockmap, hammer_off_t zone_off)
330 {
331         blockmap->phys_offset = alloc_bigblock();
332         blockmap->alloc_offset = zone_off;
333 }
334
335 static
336 void *
337 alloc_blockmap(int zone, int bytes, hammer_off_t *result_offp,
338                struct buffer_info **bufferp)
339 {
340         struct buffer_info *buffer;
341         struct volume_info *volume;
342         hammer_blockmap_entry_t rootmap;
343         hammer_blockmap_entry_t blockmap;
344         void *ptr;
345         int i;
346
347         volume = get_volume(RootVolNo);
348
349         rootmap = &volume->ondisk->vol0_blockmap[zone];
350
351         /*
352          * Alignment and buffer-boundary issues
353          */
354         bytes = (bytes + 7) & ~7;
355         if ((rootmap->phys_offset ^ (rootmap->phys_offset + bytes - 1)) &
356             ~HAMMER_BUFMASK64) {
357                 volume->cache.modified = 1;
358                 rootmap->phys_offset = (rootmap->phys_offset + bytes) &
359                                        ~HAMMER_BUFMASK64;
360         }
361
362         /*
363          * Dive layer 2
364          */
365         i = (rootmap->alloc_offset >> (HAMMER_LARGEBLOCK_BITS +
366              HAMMER_BLOCKMAP_BITS)) & HAMMER_BLOCKMAP_RADIX_MASK;
367
368         blockmap = get_buffer_data(rootmap->phys_offset + i * sizeof(*blockmap),
369                                    bufferp, 0);
370         buffer = *bufferp;
371         if ((rootmap->alloc_offset & HAMMER_LARGEBLOCK_LAYER1_MASK) == 0) {
372                 buffer->cache.modified = 1;
373                 bzero(blockmap, sizeof(*blockmap));
374                 blockmap->phys_offset = alloc_bigblock();
375         }
376
377         /*
378          * Dive layer 1
379          */
380         i = (rootmap->alloc_offset >> HAMMER_LARGEBLOCK_BITS) &
381             HAMMER_BLOCKMAP_RADIX_MASK;
382
383         blockmap = get_buffer_data(
384                 blockmap->phys_offset + i * sizeof(*blockmap), bufferp, 0);
385         buffer = *bufferp;
386
387         if ((rootmap->alloc_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
388                 buffer->cache.modified = 1;
389                 bzero(blockmap, sizeof(*blockmap));
390                 blockmap->phys_offset = alloc_bigblock();
391                 blockmap->bytes_free = HAMMER_LARGEBLOCK_SIZE;
392         }
393
394         buffer->cache.modified = 1;
395         volume->cache.modified = 1;
396         blockmap->bytes_free -= bytes;
397         *result_offp = rootmap->alloc_offset;
398         rootmap->alloc_offset += bytes;
399
400         i = (rootmap->phys_offset >> HAMMER_BUFFER_BITS) &
401             HAMMER_BUFFERS_PER_LARGEBLOCK_MASK;
402         ptr = get_buffer_data(
403                 blockmap->phys_offset + i * HAMMER_BUFSIZE +
404                  ((int32_t)*result_offp & HAMMER_BUFMASK), bufferp, 0);
405         buffer->cache.modified = 1;
406
407         rel_volume(volume);
408         return(ptr);
409 }
410
411 static
412 hammer_off_t
413 alloc_bigblock(void)
414 {
415         struct volume_info *volume;
416         hammer_off_t result_offset;
417
418         volume = get_volume(RootVolNo);
419         result_offset = volume->ondisk->vol0_free_off;
420         volume->ondisk->vol0_free_off += HAMMER_LARGEBLOCK_SIZE;
421         if ((volume->ondisk->vol0_free_off & HAMMER_OFF_SHORT_MASK) >
422             (hammer_off_t)(volume->ondisk->vol_buf_end - volume->ondisk->vol_buf_beg)) {
423                 panic("alloc_bigblock: Ran out of room, filesystem too small");
424         }
425         rel_volume(volume);
426         return(result_offset);
427 }
428
#if 0
/*
 * NOTE: this function is compiled out (#if 0).
 *
 * Reserve space from the FIFO.  Make sure that bytes does not cross a
 * record boundary.
 *
 * Zero out base_bytes and initialize the fifo head and tail.  The
 * data area is not zeroed.
 *
 * base_bytes - size of the header portion that gets zeroed.
 * ext_bytes  - size of the extension/data portion following it.
 * bufp       - receives the buffer holding the reservation; a buffer
 *              already stored there is overwritten without being released.
 * hdr_type   - stored in both the head and the tail.
 *
 * Returns the offset of the reservation, i.e. the value vol0_fifo_end had
 * on entry.
 */
static
hammer_off_t
hammer_alloc_fifo(int32_t base_bytes, int32_t ext_bytes,
                  struct buffer_info **bufp, u_int16_t hdr_type)
{
        struct buffer_info *buf;
        struct volume_info *volume;
        hammer_fifo_head_t head;
        hammer_fifo_tail_t tail;
        hammer_off_t off;
        int32_t aligned_bytes;

        /* head + extension + tail, rounded up to the head alignment */
        aligned_bytes = (base_bytes + ext_bytes + HAMMER_TAIL_ONDISK_SIZE +
                         HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK;

        volume = get_volume(RootVolNo);
        off = volume->ondisk->vol0_fifo_end;

        /*
         * For now don't deal with transitions across buffer boundaries,
         * only newfs_hammer uses this function.
         *
         * NOTE(review): the two sides use different masks
         * (HAMMER_BUFMASK64 vs HAMMER_BUFMASK) and the right side tests
         * the byte after the reservation rather than its last byte, so a
         * reservation ending exactly on a buffer boundary would trip the
         * assertion - confirm before re-enabling.
         */
        assert((off & ~HAMMER_BUFMASK64) ==
                ((off + aligned_bytes) & ~HAMMER_BUFMASK));

        *bufp = buf = get_buffer(off, 0);

        buf->cache.modified = 1;
        volume->cache.modified = 1;

        head = (void *)((char *)buf->ondisk + ((int32_t)off & HAMMER_BUFMASK));
        bzero(head, base_bytes);

        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
        head->hdr_type = hdr_type;
        head->hdr_size = aligned_bytes;
        head->hdr_seq = volume->ondisk->vol0_next_seq++;

        /* the tail occupies the last HAMMER_TAIL_ONDISK_SIZE bytes */
        tail = (void*)((char *)head + aligned_bytes - HAMMER_TAIL_ONDISK_SIZE);
        tail->tail_signature = HAMMER_TAIL_SIGNATURE;
        tail->tail_type = hdr_type;
        tail->tail_size = aligned_bytes;

        volume->ondisk->vol0_fifo_end += aligned_bytes;
        volume->cache.modified = 1;

        rel_volume(volume);

        return(off);
}

#endif
489
490 /*
491  * Flush various tracking structures to disk
492  */
493
494 /*
495  * Flush various tracking structures to disk
496  */
497 void
498 flush_all_volumes(void)
499 {
500         struct volume_info *vol;
501
502         TAILQ_FOREACH(vol, &VolList, entry)
503                 flush_volume(vol);
504 }
505
506 void
507 flush_volume(struct volume_info *volume)
508 {
509         struct buffer_info *buffer;
510
511         TAILQ_FOREACH(buffer, &volume->buffer_list, entry)
512                 flush_buffer(buffer);
513         writehammerbuf(volume, volume->ondisk, 0);
514         volume->cache.modified = 0;
515 }
516
517 void
518 flush_buffer(struct buffer_info *buffer)
519 {
520         writehammerbuf(buffer->volume, buffer->ondisk, buffer->buf_disk_offset);
521         buffer->cache.modified = 0;
522 }
523
#if 0
/*
 * NOTE: this function is compiled out (#if 0).
 *
 * Generic buffer initialization: stamp the head signature and the given
 * header type into `head` and reset its size, crc and sequence number
 * to zero.
 */
static void
init_fifo_head(hammer_fifo_head_t head, u_int16_t hdr_type)
{
        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
        head->hdr_type = hdr_type;
        head->hdr_size = 0;
        head->hdr_crc = 0;
        head->hdr_seq = 0;
}

#endif
539
#if 0
/*
 * NOTE: this function is compiled out (#if 0).
 *
 * Core I/O operations
 *
 * Read one HAMMER_BUFSIZE buffer from `offset` on the volume's file
 * descriptor into `data`.  Anything other than a full-size read
 * terminates the program via err().
 */
static void
readhammerbuf(struct volume_info *vol, void *data, int64_t offset)
{
        ssize_t n;

        n = pread(vol->fd, data, HAMMER_BUFSIZE, offset);
        if (n != HAMMER_BUFSIZE)
                err(1, "Read volume %d (%s)", vol->vol_no, vol->name);
}

#endif
555
556 static void
557 writehammerbuf(struct volume_info *vol, const void *data, int64_t offset)
558 {
559         ssize_t n;
560
561         n = pwrite(vol->fd, data, HAMMER_BUFSIZE, offset);
562         if (n != HAMMER_BUFSIZE)
563                 err(1, "Write volume %d (%s)", vol->vol_no, vol->name);
564 }
565
/*
 * Print a printf-style message followed by a newline on stderr and exit
 * with status 1.  Does not return.
 */
void
panic(const char *ctl, ...)
{
        va_list ap;

        va_start(ap, ctl);
        (void)vfprintf(stderr, ctl, ap);
        va_end(ap);

        fputc('\n', stderr);
        exit(1);
}
577