HAMMER 28/many: Implement zoned blockmap
[dragonfly.git] sbin/hammer/ondisk.c
/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sbin/hammer/ondisk.c,v 1.11 2008/02/10 09:50:55 dillon Exp $
 */

#include <sys/types.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <err.h>
#include <fcntl.h>
#include "hammer_util.h"

static void *alloc_blockmap(int zone, int bytes, hammer_off_t *result_offp,
                        struct buffer_info **bufferp);
static hammer_off_t alloc_bigblock(void);
#if 0
static void init_fifo_head(hammer_fifo_head_t head, u_int16_t hdr_type);
static hammer_off_t hammer_alloc_fifo(int32_t base_bytes, int32_t ext_bytes,
                        struct buffer_info **bufp, u_int16_t hdr_type);
static void readhammerbuf(struct volume_info *vol, void *data,
                        int64_t offset);
#endif
static void writehammerbuf(struct volume_info *vol, const void *data,
                        int64_t offset);

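/*
 * Global state shared by the HAMMER user utilities: the filesystem type
 * and id UUIDs, the boot and memory area sizes, and the list of volumes
 * opened via setup_volume().  RootVolNo is taken from the first volume
 * header read and must agree across all volumes.
 */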
uuid_t Hammer_FSType;
uuid_t Hammer_FSId;
int64_t BootAreaSize;
int64_t MemAreaSize;
int UsingSuperClusters;
int NumVolumes;
int RootVolNo = -1;
struct volume_list VolList = TAILQ_HEAD_INITIALIZER(VolList);

/*
 * Lookup the requested information structure and related on-disk buffer.
 * Missing structures are created.
 */
struct volume_info *
setup_volume(int32_t vol_no, const char *filename, int isnew, int oflags)
{
        struct volume_info *vol;
        struct volume_info *scan;
        struct hammer_volume_ondisk *ondisk;
        int n;

        /*
         * Allocate the volume structure
         */
        vol = malloc(sizeof(*vol));
        bzero(vol, sizeof(*vol));
        TAILQ_INIT(&vol->buffer_list);
        vol->name = strdup(filename);
        vol->fd = open(filename, oflags);
        if (vol->fd < 0) {
                free(vol->name);
                free(vol);
                err(1, "setup_volume: %s: Open failed", filename);
        }

        /*
         * Read or initialize the volume header
         */
        vol->ondisk = ondisk = malloc(HAMMER_BUFSIZE);
        if (isnew) {
                bzero(ondisk, HAMMER_BUFSIZE);
        } else {
                n = pread(vol->fd, ondisk, HAMMER_BUFSIZE, 0);
                if (n != HAMMER_BUFSIZE) {
                        err(1, "setup_volume: %s: Read failed at offset 0",
                            filename);
                }
                vol_no = ondisk->vol_no;
                if (RootVolNo < 0) {
                        RootVolNo = ondisk->vol_rootvol;
                } else if (RootVolNo != (int)ondisk->vol_rootvol) {
                        errx(1, "setup_volume: %s: root volume disagreement: "
                                "%d vs %d",
                                vol->name, RootVolNo, ondisk->vol_rootvol);
                }

                if (bcmp(&Hammer_FSType, &ondisk->vol_fstype, sizeof(Hammer_FSType)) != 0) {
                        errx(1, "setup_volume: %s: Header does not indicate "
                                "that this is a hammer volume", vol->name);
                }
                if (TAILQ_EMPTY(&VolList)) {
                        Hammer_FSId = vol->ondisk->vol_fsid;
                } else if (bcmp(&Hammer_FSId, &ondisk->vol_fsid, sizeof(Hammer_FSId)) != 0) {
                        errx(1, "setup_volume: %s: FSId does not match other "
                                "volumes!", vol->name);
                }
        }
        vol->vol_no = vol_no;

        if (isnew) {
                /*init_fifo_head(&ondisk->head, HAMMER_HEAD_TYPE_VOL);*/
                vol->cache.modified = 1;
        }

        /*
         * Link the volume structure in
         */
        TAILQ_FOREACH(scan, &VolList, entry) {
                if (scan->vol_no == vol_no) {
                        errx(1, "setup_volume %s: Duplicate volume number %d "
                                "against %s", filename, vol_no, scan->name);
                }
        }
        TAILQ_INSERT_TAIL(&VolList, vol, entry);
        return(vol);
}

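/*
 * Look up an already-configured volume by number and add a reference to
 * it.  Volumes are not tracked by the hammer buffer cache; the reference
 * is dropped again with rel_volume().
 */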
struct volume_info *
get_volume(int32_t vol_no)
{
        struct volume_info *vol;

        TAILQ_FOREACH(vol, &VolList, entry) {
                if (vol->vol_no == vol_no)
                        break;
        }
        if (vol == NULL)
                errx(1, "get_volume: Volume %d does not exist!", vol_no);
        ++vol->cache.refs;
        /* not added to or removed from hammer cache */
        return(vol);
}

void
rel_volume(struct volume_info *volume)
{
        /* not added to or removed from hammer cache */
        --volume->cache.refs;
}

/*
 * Acquire the specified buffer.
 */
struct buffer_info *
get_buffer(hammer_off_t buf_offset, int isnew)
{
        void *ondisk;
        struct buffer_info *buf;
        struct volume_info *volume;
        int n;
        int vol_no;

        assert((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);

        vol_no = HAMMER_VOL_DECODE(buf_offset);
        volume = get_volume(vol_no);
        buf_offset &= ~HAMMER_BUFMASK64;

        TAILQ_FOREACH(buf, &volume->buffer_list, entry) {
                if (buf->buf_offset == buf_offset)
                        break;
        }
        if (buf == NULL) {
                buf = malloc(sizeof(*buf));
                bzero(buf, sizeof(*buf));
                buf->buf_offset = buf_offset;
                buf->buf_disk_offset = volume->ondisk->vol_buf_beg +
                                       (buf_offset & HAMMER_OFF_SHORT_MASK);
                buf->volume = volume;
                TAILQ_INSERT_TAIL(&volume->buffer_list, buf, entry);
                ++volume->cache.refs;
                buf->cache.u.buffer = buf;
                hammer_cache_add(&buf->cache, ISBUFFER);
        }
        ++buf->cache.refs;
        hammer_cache_flush();
        if ((ondisk = buf->ondisk) == NULL) {
                buf->ondisk = ondisk = malloc(HAMMER_BUFSIZE);
                if (isnew == 0) {
                        n = pread(volume->fd, ondisk, HAMMER_BUFSIZE,
                                  buf->buf_disk_offset);
                        if (n != HAMMER_BUFSIZE) {
                                err(1, "get_buffer: %s:%016llx Read failed at "
                                       "offset %lld",
                                       volume->name, buf->buf_offset,
                                       buf->buf_disk_offset);
                        }
                }
        }
        if (isnew) {
                bzero(ondisk, HAMMER_BUFSIZE);
                buf->cache.modified = 1;
        }
        return(buf);
}

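/*
 * Release a buffer reference.  When the last reference is dropped and the
 * cache has marked the buffer for deletion, flush it if modified, unlink
 * it from its volume and free the in-memory copy.
 */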
void
rel_buffer(struct buffer_info *buffer)
{
        struct volume_info *volume;

        assert(buffer->cache.refs > 0);
        if (--buffer->cache.refs == 0) {
                if (buffer->cache.delete) {
                        volume = buffer->volume;
                        if (buffer->cache.modified)
                                flush_buffer(buffer);
                        TAILQ_REMOVE(&volume->buffer_list, buffer, entry);
                        hammer_cache_del(&buffer->cache);
                        free(buffer->ondisk);
                        free(buffer);
                        rel_volume(volume);
                }
        }
}

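/*
 * Return a pointer to the requested data within its buffer, releasing
 * whatever buffer *bufferp previously referenced and loading a newly
 * referenced buffer into it.
 */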
void *
get_buffer_data(hammer_off_t buf_offset, struct buffer_info **bufferp,
                int isnew)
{
        struct buffer_info *buffer;

        if (*bufferp) {
                rel_buffer(*bufferp);
        }
        buffer = *bufferp = get_buffer(buf_offset, isnew);
        return((char *)buffer->ondisk + ((int32_t)buf_offset & HAMMER_BUFMASK));
}

/*
 * Retrieve a pointer to a B-Tree node given a cluster offset.  The underlying
 * bufp is released if non-NULL and a referenced buffer is loaded into it.
 */
hammer_node_ondisk_t
get_node(hammer_off_t node_offset, struct buffer_info **bufp)
{
        struct buffer_info *buf;

        if (*bufp)
                rel_buffer(*bufp);
        *bufp = buf = get_buffer(node_offset, 0);
        return((void *)((char *)buf->ondisk +
                        (int32_t)(node_offset & HAMMER_BUFMASK)));
}

/*
 * Allocate HAMMER elements - btree nodes, data storage, and record elements
 *
 * NOTE: hammer_alloc_fifo() initializes the fifo header for the returned
 * item and zeroes out the remainder, so don't bzero() it.
 */
void *
alloc_btree_element(hammer_off_t *offp)
{
        struct buffer_info *buffer = NULL;
        hammer_node_ondisk_t node;

        node = alloc_blockmap(HAMMER_ZONE_BTREE_INDEX, sizeof(*node),
                              offp, &buffer);
        bzero(node, sizeof(*node));
        /* XXX buffer not released, pointer remains valid */
        return(node);
}

hammer_record_ondisk_t
alloc_record_element(hammer_off_t *offp, int32_t data_len, void **datap)
{
        struct buffer_info *record_buffer = NULL;
        struct buffer_info *data_buffer = NULL;
        hammer_record_ondisk_t rec;

        rec = alloc_blockmap(HAMMER_ZONE_RECORD_INDEX, sizeof(*rec),
                             offp, &record_buffer);
        bzero(rec, sizeof(*rec));

        if (data_len >= HAMMER_BUFSIZE) {
                assert(data_len <= HAMMER_BUFSIZE); /* just one buffer */
                *datap = alloc_blockmap(HAMMER_ZONE_LARGE_DATA_INDEX, data_len,
                                        &rec->base.data_off, &data_buffer);
                rec->base.data_len = data_len;
                bzero(*datap, data_len);
        } else if (data_len) {
                *datap = alloc_blockmap(HAMMER_ZONE_SMALL_DATA_INDEX, data_len,
                                        &rec->base.data_off, &data_buffer);
                rec->base.data_len = data_len;
                bzero(*datap, data_len);
        } else {
                *datap = NULL;
        }
        /* XXX buf not released, ptr remains valid */
        return(rec);
}

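/*
 * Illustrative use only (a sketch, not code from the tree): a formatting
 * utility such as newfs_hammer might allocate a record together with its
 * data area and then copy the caller's payload in.  alloc_record_element()
 * already fills in rec->base.data_off and rec->base.data_len when
 * data_len is non-zero, and the data area is returned zeroed.
 *
 *        hammer_off_t rec_off;
 *        void *data;
 *        hammer_record_ondisk_t rec;
 *
 *        rec = alloc_record_element(&rec_off, data_len, &data);
 *        if (data_len)
 *                bcopy(src, data, data_len);
 *
 * (src and data_len stand in for the caller's record payload.)
 */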
/*
 * Format a new blockmap
 */
void
format_blockmap(hammer_blockmap_entry_t blockmap, hammer_off_t zone_off)
{
        blockmap->phys_offset = alloc_bigblock();
        blockmap->alloc_offset = zone_off;
}

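/*
 * Allocate space from the specified zone's blockmap.  The zone's root
 * blockmap entry in the root volume header tracks a linear alloc_offset;
 * that offset is translated through two layers of blockmap entries down
 * to the big-block which actually backs the storage, and fresh big-blocks
 * are pulled from alloc_bigblock() whenever alloc_offset crosses a
 * layer-2 or layer-1 boundary.  Requests are rounded up to 8 bytes.  The
 * zone offset of the allocation is stored in *result_offp and a pointer
 * into the backing buffer is returned.
 */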
static
void *
alloc_blockmap(int zone, int bytes, hammer_off_t *result_offp,
               struct buffer_info **bufferp)
{
        struct buffer_info *buffer;
        struct volume_info *volume;
        hammer_blockmap_entry_t rootmap;
        hammer_blockmap_entry_t blockmap;
        void *ptr;
        int i;

        volume = get_volume(RootVolNo);

        rootmap = &volume->ondisk->vol0_blockmap[zone];

        /*
         * Alignment and buffer-boundary issues
         */
        bytes = (bytes + 7) & ~7;
        if ((rootmap->phys_offset ^ (rootmap->phys_offset + bytes - 1)) &
            ~HAMMER_BUFMASK64) {
                volume->cache.modified = 1;
                rootmap->phys_offset = (rootmap->phys_offset + bytes) &
                                       ~HAMMER_BUFMASK64;
        }

        /*
         * Dive layer 2
         */
        i = (rootmap->alloc_offset >> (HAMMER_LARGEBLOCK_BITS +
            HAMMER_BLOCKMAP_BITS)) & HAMMER_BLOCKMAP_RADIX_MASK;

        blockmap = get_buffer_data(rootmap->phys_offset + i * sizeof(*blockmap),
                                   bufferp, 0);
        buffer = *bufferp;
        if ((rootmap->alloc_offset & HAMMER_LARGEBLOCK_LAYER1_MASK) == 0) {
                buffer->cache.modified = 1;
                bzero(blockmap, sizeof(*blockmap));
                blockmap->phys_offset = alloc_bigblock();
        }

        /*
         * Dive layer 1
         */
        i = (rootmap->alloc_offset >> HAMMER_LARGEBLOCK_BITS) &
            HAMMER_BLOCKMAP_RADIX_MASK;

        blockmap = get_buffer_data(
                blockmap->phys_offset + i * sizeof(*blockmap), bufferp, 0);
        buffer = *bufferp;

        if ((rootmap->alloc_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
                buffer->cache.modified = 1;
                bzero(blockmap, sizeof(*blockmap));
                blockmap->phys_offset = alloc_bigblock();
                blockmap->bytes_free = HAMMER_LARGEBLOCK_SIZE;
        }

        buffer->cache.modified = 1;
        volume->cache.modified = 1;
        blockmap->bytes_free -= bytes;
        *result_offp = rootmap->alloc_offset;
        rootmap->alloc_offset += bytes;

        i = (rootmap->phys_offset >> HAMMER_BUFFER_BITS) &
            HAMMER_BUFFERS_PER_LARGEBLOCK_MASK;
        ptr = get_buffer_data(
                blockmap->phys_offset + i * HAMMER_BUFSIZE +
                ((int32_t)*result_offp & HAMMER_BUFMASK), bufferp, 0);
        buffer->cache.modified = 1;

        rel_volume(volume);
        return(ptr);
}

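/*
 * Carve the next HAMMER_LARGEBLOCK_SIZE big-block out of vol0_free_off in
 * the root volume header and return its offset.  Panics if the volume's
 * buffer area has been exhausted.
 */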
static
hammer_off_t
alloc_bigblock(void)
{
        struct volume_info *volume;
        hammer_off_t result_offset;

        volume = get_volume(RootVolNo);
        result_offset = volume->ondisk->vol0_free_off;
        volume->ondisk->vol0_free_off += HAMMER_LARGEBLOCK_SIZE;
        if ((volume->ondisk->vol0_free_off & HAMMER_OFF_SHORT_MASK) >
            (hammer_off_t)(volume->ondisk->vol_buf_end - volume->ondisk->vol_buf_beg)) {
                panic("alloc_bigblock: Ran out of room, filesystem too small");
        }
        rel_volume(volume);
        return(result_offset);
}

#if 0
/*
 * Reserve space from the FIFO.  Make sure that bytes does not cross a
 * record boundary.
 *
 * Zero out base_bytes and initialize the fifo head and tail.  The
 * data area is not zeroed.
 */
static
hammer_off_t
hammer_alloc_fifo(int32_t base_bytes, int32_t ext_bytes,
                  struct buffer_info **bufp, u_int16_t hdr_type)
{
        struct buffer_info *buf;
        struct volume_info *volume;
        hammer_fifo_head_t head;
        hammer_fifo_tail_t tail;
        hammer_off_t off;
        int32_t aligned_bytes;

        aligned_bytes = (base_bytes + ext_bytes + HAMMER_TAIL_ONDISK_SIZE +
                         HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK;

        volume = get_volume(RootVolNo);
        off = volume->ondisk->vol0_fifo_end;

        /*
         * For now don't deal with transitions across buffer boundaries,
         * only newfs_hammer uses this function.
         */
        assert((off & ~HAMMER_BUFMASK64) ==
               ((off + aligned_bytes) & ~HAMMER_BUFMASK));

        *bufp = buf = get_buffer(off, 0);

        buf->cache.modified = 1;
        volume->cache.modified = 1;

        head = (void *)((char *)buf->ondisk + ((int32_t)off & HAMMER_BUFMASK));
        bzero(head, base_bytes);

        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
        head->hdr_type = hdr_type;
        head->hdr_size = aligned_bytes;
        head->hdr_seq = volume->ondisk->vol0_next_seq++;

        tail = (void *)((char *)head + aligned_bytes - HAMMER_TAIL_ONDISK_SIZE);
        tail->tail_signature = HAMMER_TAIL_SIGNATURE;
        tail->tail_type = hdr_type;
        tail->tail_size = aligned_bytes;

        volume->ondisk->vol0_fifo_end += aligned_bytes;
        volume->cache.modified = 1;

        rel_volume(volume);

        return(off);
}

#endif

/*
 * Flush various tracking structures to disk
 */
void
flush_all_volumes(void)
{
        struct volume_info *vol;

        TAILQ_FOREACH(vol, &VolList, entry)
                flush_volume(vol);
}

void
flush_volume(struct volume_info *volume)
{
        struct buffer_info *buffer;

        TAILQ_FOREACH(buffer, &volume->buffer_list, entry)
                flush_buffer(buffer);
        writehammerbuf(volume, volume->ondisk, 0);
        volume->cache.modified = 0;
}

void
flush_buffer(struct buffer_info *buffer)
{
        writehammerbuf(buffer->volume, buffer->ondisk, buffer->buf_disk_offset);
        buffer->cache.modified = 0;
}

#if 0
/*
 * Generic buffer initialization
 */
static void
init_fifo_head(hammer_fifo_head_t head, u_int16_t hdr_type)
{
        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
        head->hdr_type = hdr_type;
        head->hdr_size = 0;
        head->hdr_crc = 0;
        head->hdr_seq = 0;
}

#endif

#if 0
/*
 * Core I/O operations
 */
static void
readhammerbuf(struct volume_info *vol, void *data, int64_t offset)
{
        ssize_t n;

        n = pread(vol->fd, data, HAMMER_BUFSIZE, offset);
        if (n != HAMMER_BUFSIZE)
                err(1, "Read volume %d (%s)", vol->vol_no, vol->name);
}

#endif

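/*
 * Write a single HAMMER_BUFSIZE block to the volume at the given byte
 * offset, exiting on a short write or I/O error.
 */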
static void
writehammerbuf(struct volume_info *vol, const void *data, int64_t offset)
{
        ssize_t n;

        n = pwrite(vol->fd, data, HAMMER_BUFSIZE, offset);
        if (n != HAMMER_BUFSIZE)
                err(1, "Write volume %d (%s)", vol->vol_no, vol->name);
}

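/*
 * Print a formatted message to stderr and exit.  Used by alloc_bigblock()
 * when the filesystem runs out of room.
 */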
void
panic(const char *ctl, ...)
{
        va_list va;

        va_start(va, ctl);
        vfprintf(stderr, ctl, va);
        va_end(va);
        fprintf(stderr, "\n");
        exit(1);
}