| Commit | Line | Data |
|---|---|---|
| 919cd235 AH |
1 | /* |
| 2 | * Copyright (c) 2010 The DragonFly Project. All rights reserved. | |
| 3 | * | |
| 4 | * This code is derived from software contributed to The DragonFly Project | |
| 5 | * by Alex Hornung <ahornung@gmail.com> | |
| 6 | * | |
| 7 | * Redistribution and use in source and binary forms, with or without | |
| 8 | * modification, are permitted provided that the following conditions | |
| 9 | * are met: | |
| 10 | * | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in | |
| 15 | * the documentation and/or other materials provided with the | |
| 16 | * distribution. | |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 18 | * contributors may be used to endorse or promote products derived | |
| 19 | * from this software without specific, prior written permission. | |
| 20 | * | |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 32 | * SUCH DAMAGE. | |
| 33 | */ | |
| 34 | ||
| 35 | /* | |
| 36 | * This file implements an initial version of a mirror target. | |
| 37 | */ | |
| 38 | #include <sys/types.h> | |
| 39 | #include <sys/param.h> | |
| 40 | ||
| 41 | #include <sys/bio.h> | |
| 42 | #include <sys/buf.h> | |
| 43 | #include <sys/malloc.h> | |
| 44 | #include <sys/uuid.h> | |
| 45 | #include <sys/vnode.h> | |
| 46 | ||
| a84e173e | 47 | #include <dev/disk/dm/dm.h> |
| 919cd235 AH |
48 | MALLOC_DEFINE(M_DMDMIRROR, "dm_dmirror", "Device Mapper Target DMIRROR"); |
| 49 | ||
/*
 * Segment descriptor flags; these are tested against segdesc_t.flags.
 */
#define MEDIA_UNSTABLE		0x0001
#define MEDIA_READ_DEGRADED	0x0002
#define MEDIA_WRITE_DEGRADED	0x0004
#define MEDIA_MASTER		0x0008
#define UNINITIALIZED		0x0010
#define OLD_UNSTABLE		0x0020
#define OLD_MASTER		0x0040
/* Misspelled original name, kept so existing references keep building. */
#define OLD_MSATER		OLD_MASTER

/* dmirror disk flags (presumably for dmirror_disk_t.flags -- TODO confirm) */
#define DISK_ONLINE		0x0001
| 61 | ||
| 62 | ||
/*
 * Per-bio bookkeeping, kept in the bio_caller_info slots of a bio:
 *
 *   bio_caller_info1.ptr     disk the bio was issued to
 *   bio_caller_info2.offset  segment number the bio falls into
 *   bio_caller_info3.value   retry counter
 *   bio_caller_info3.ptr     "mbuf" pointer
 *
 * NOTE(review): the retry counter and the mbuf pointer both use
 * bio_caller_info3.  Presumably that slot is a union, in which case a bio
 * can carry only one of the two at a time -- confirm against struct bio.
 *
 * Every getter tolerates a NULL bio and then yields NULL or 0.
 */
#define dmirror_set_bio_disk(bio, x)	((bio)->bio_caller_info1.ptr = (x))
#define dmirror_get_bio_disk(bio)	((bio)?((bio)->bio_caller_info1.ptr):NULL)

#define dmirror_set_bio_segno(bio, x)	((bio)->bio_caller_info2.offset = (x))
/* Older spelling of dmirror_set_bio_segno(); kept for compatibility. */
#define dmirror_set_bio_seg(bio, x)	dmirror_set_bio_segno(bio, x)
#define dmirror_get_bio_segno(bio)	((bio)?((bio)->bio_caller_info2.offset):0)

#define dmirror_set_bio_retries(bio, x)	((bio)->bio_caller_info3.value = (x))
#define dmirror_get_bio_retries(bio)	((bio)?((bio)->bio_caller_info3.value):0)

#define dmirror_set_bio_mbuf(bio, x)	((bio)->bio_caller_info3.ptr = (x))
#define dmirror_get_bio_mbuf(bio)	((bio)?((bio)->bio_caller_info3.ptr):NULL)
| 73 | ||
| 74 | ||
| 75 | ||
/*
 * One descriptor exists per logical segment.  Member order and widths are
 * kept exactly as they were.
 */
struct segdesc {
	uint32_t	flags;		/* flag bits, including state */
	uint32_t	zf_bitmap;	/* zero-fill bitmap */
	uint8_t		disk_no;
	uint8_t		spare1;
	uint16_t	spare2;
	uint32_t	spare3;
	/* XXX: some timestamp/serial */
};

typedef struct segdesc segdesc_t;
| 86 | ||
| 87 | typedef struct dmirror_disk { | |
| 88 | uint32_t flags; | |
| 89 | dm_pdev_t *pdev; | |
| 90 | } dmirror_disk_t; | |
| 91 | ||
| 92 | typedef struct target_dmirror_config { | |
| 93 | size_t params_len; | |
| 94 | dmirror_disk_t disks[4]; | |
| 95 | uint8_t ndisks; | |
| 96 | /* XXX: uuid stuff */ | |
| 97 | ||
| 98 | } dm_target_dmirror_config_t; | |
| 99 | ||
| 100 | static | |
| 101 | struct bio* | |
| 102 | dmirror_clone_bio(struct bio *obio) | |
| 103 | { | |
| 104 | struct bio *bio; | |
| 105 | struct buf *mbp; | |
| 106 | struct buf *bp; | |
| 107 | ||
| 108 | mbp = obio->bio_buf; | |
| 109 | bp = getpbuf(NULL); | |
| 110 | ||
| 111 | BUF_KERNPROC(bp); | |
| 112 | bp->b_vp = mbp->b_vp; | |
| 113 | bp->b_cmd = mbp->b_cmd; | |
| 114 | bp->b_data = (char *)mbp->b_data; | |
| 115 | bp->b_resid = bp->b_bcount = mbp->b_bcount; | |
| 116 | bp->b_bufsize = bp->b_bcount; | |
| 117 | ||
| 118 | bio = &bp->b_bio1; | |
| 119 | bio->bio_offset = obio->bio_offset; | |
| 120 | ||
| 121 | return (bio); | |
| 122 | } | |
| 123 | ||
| 124 | static void | |
| 125 | dmirror_write_done(struct bio *bio) | |
| 126 | { | |
| 127 | dmirror_disk_t disk; | |
| 128 | off_t segno; | |
| 129 | struct bio *obio, *mbio; | |
| 130 | int retries; | |
| 131 | ||
| 132 | disk = dmirror_get_bio_disk(bio); | |
| 133 | segno = dmirror_get_bio_segno(bio); | |
| 134 | mbio = dmirror_get_bio_mbuf(bio); | |
| 135 | ||
| 136 | if (bio->bio_buf->b_flags & B_ERROR) { | |
| 137 | /* write failed */ | |
| 138 | } | |
| 139 | ||
| 140 | obio = pop_bio(bio); | |
| 141 | biodone(obio); | |
| 142 | } | |
| 143 | ||
| 144 | void | |
| 145 | dmirror_issue_write(dmirror_disk_t disk, struct bio *bio) | |
| 146 | { | |
| 147 | dmirror_set_bio_disk(bio, disk); | |
| 148 | dmirror_set_bio_segno(bio, SEGNO_FROM_OFFSET(bio->bio_offset)); | |
| 149 | ||
| 150 | bio->bio_done = dmirror_write_done; | |
| 151 | vn_strategy(disk->pdev, bio); | |
| 152 | } | |
| 153 | ||
| 154 | void | |
| 155 | dmirror_write(dm_target_crypt_config_t config, struct bio *bio) | |
| 156 | { | |
| 157 | dmirror_disk_t disk, m_disk; | |
| 158 | struct bio *wbio1, *wbio2; | |
| 159 | segdesc_t segdesc; | |
| 160 | int i, masters = 0; | |
| 161 | ||
| 162 | for(i = 0; i < XXX config->ndisks; i++) { | |
| 163 | disk = &config->disks[i]; | |
| 164 | segdesc = SEGDESC_FROM_OFFSET(disk, bio->bio_offset); | |
| 165 | if (segdesc->flags & MEDIA_MASTER) { | |
| 166 | if (++masters == 1) | |
| 167 | m_disk = disk; | |
| 168 | } | |
| 169 | } | |
| 170 | ||
| 171 | if (masters == 1) { | |
| 172 | dmirror_set_bio_mbuf(bio, NULL); | |
| 173 | dmirror_issue_write(m_disk, bio); | |
| 174 | } else { | |
| 175 | wbio1 = dmirror_clone_bio(bio); | |
| 176 | wbio2 = dmirror_clone_bio(bio); | |
| 177 | dmirror_set_bio_mbuf(wbio1, bio); | |
| 178 | dmirror_set_bio_mbuf(wbio2, bio); | |
| 179 | dmirror_issue_write(XXX disk1, wbio1); | |
| 180 | dmirror_issue_write(XXX disk2, wbio2); | |
| 181 | } | |
| 182 | ||
| 183 | } | |
| 184 | ||
| 185 | static void | |
| 186 | segdesc_set_flag(dmirror_disk_t disk, off_t segno, int flag) | |
| 187 | { | |
| 188 | /* | |
| 189 | * XXX: set the flag on the in-memory descriptor and write back to disks. | |
| 190 | */ | |
| 191 | foo |= flag; | |
| 192 | } | |
| 193 | ||
| 194 | ||
| 195 | static void | |
| 196 | segdesc_clear_flag(dmirror_disk_t disk, off_t segno, int flag) | |
| 197 | { | |
| 198 | /* | |
| 199 | * XXX: set the flag on the in-memory descriptor and write back to disks. | |
| 200 | */ | |
| 201 | foo &= ~flag; | |
| 202 | } | |
| 203 | ||
| 204 | static void | |
| 205 | dmirror_read_done(struct bio *bio) | |
| 206 | { | |
| 207 | dmirror_disk_t disk; | |
| 208 | off_t segno; | |
| 209 | struct bio *obio; | |
| 210 | int retries; | |
| 211 | ||
| 212 | disk = dmirror_get_bio_disk(bio); | |
| 213 | segno = dmirror_get_bio_segno(bio); | |
| 214 | retries = dmirror_get_bio_retries(bio); | |
| 215 | ||
| 216 | if (bio->bio_buf->b_flags & B_ERROR) { | |
| 217 | /* read failed, so redispatch to a different disk */ | |
| 218 | segdesc_set_flag(disk, segno, MEDIA_READ_DEGRADED); | |
| 219 | /* XXX: set other disk to master, if possible */ | |
| 220 | if (retries < disk->config->max_retries) { | |
| 221 | dmirror_set_bio_retries(bio, retries + 1); | |
| 222 | /* | |
| 223 | * XXX: how do we restore the bio to health? Like this? | |
| 224 | */ | |
| 225 | bio->bio_buf->b_flags &= ~(B_ERROR | B_INVAL); | |
| 226 | /* | |
| 227 | * XXX: something tells me that dispatching stuff from a | |
| 228 | * biodone routine is not the greatest idea | |
| 229 | */ | |
| 230 | dmirror_issue_read(next_disk, bio); | |
| 231 | return; | |
| 232 | } | |
| 233 | } | |
| 234 | ||
| 235 | obio = pop_bio(bio); | |
| 236 | biodone(obio); | |
| 237 | } | |
| 238 | ||
| 239 | void | |
| 240 | dmirror_issue_read(dmirror_disk_t disk, struct bio *bio) | |
| 241 | { | |
| 242 | dmirror_set_bio_disk(bio, disk); | |
| 243 | dmirror_set_bio_segno(bio, SEGNO_FROM_OFFSET(bio->bio_offset)); | |
| 244 | ||
| 245 | bio->bio_done = dmirror_read_done; | |
| 246 | vn_strategy(disk->pdev, bio); | |
| 247 | } | |
| 248 | ||
| 249 | void | |
| 250 | dmirror_read(dm_target_crypt_config_t config, struct bio *bio) | |
| 251 | { | |
| 252 | dmirror_disk_t disk, m_disk; | |
| 253 | segdesc_t segdesc; | |
| 254 | int i, masters = 0; | |
| 255 | ||
| 256 | for(i = 0; i < XXX config->ndisks; i++) { | |
| 257 | disk = &config->disks[i]; | |
| 258 | segdesc = SEGDESC_FROM_OFFSET(disk, bio->bio_offset); | |
| 259 | if (segdesc->flags & MEDIA_MASTER) { | |
| 260 | if (++masters == 1) | |
| 261 | m_disk = disk; | |
| 262 | } | |
| 263 | } | |
| 264 | ||
| 265 | if (masters > 1) { | |
| 266 | /* XXX: fail. */ | |
| 267 | biodone(foo); | |
| 268 | return; | |
| 269 | } | |
| 270 | ||
| 271 | if (masters == 1) { | |
| 272 | segdesc = SEGDESC_FROM_OFFSET(m_disk, bio->bio_offset); | |
| 273 | if (segdesc->flags & UNINITIALIZED) { | |
| 274 | /* XXX: ... */ | |
| 275 | } | |
| 276 | dmirror_issue_read(m_disk, bio); | |
| 277 | } else { | |
| 278 | /* dispatch read to any disk */ | |
| 279 | /* but try not to send to a READ_DEGRADED drive */ | |
| 280 | m_disk = NULL; | |
| 281 | for (i = 0; i < config->ndisks; i++) { | |
| 282 | disk = &config->disks[i]; | |
| 283 | segdesc = SEGDESC_FROM_OFFSET(disk, bio->bio_offset); | |
| 284 | if (!(segdesc->flags & MEDIA_READ_DEGRADED)) { | |
| 285 | m_disk = disk; | |
| 286 | break; | |
| 287 | } | |
| 288 | } | |
| 289 | /* XXX: do the uninitialized magic here, too */ | |
| 290 | if (m_disk) { | |
| 291 | /* | |
| 292 | * XXX: we found some non-degraded disk. We might want to | |
| 293 | * optimize performance by sending reads to different disks, | |
| 294 | * not just the first one. | |
| 295 | */ | |
| 296 | dmirror_set_bio_retries(bio, 0); | |
| 297 | dmirror_issue_read(m_disk, bio); | |
| 298 | } else { | |
| 299 | /* XXX: all disks are read degraded, just sent to any */ | |
| 300 | m_disk = &config->disks[i]; | |
| 301 | dmirror_set_bio_retries(bio, 0); | |
| 302 | dmirror_issue_read(m_disk, bio); | |
| 303 | } | |
| 304 | } | |
| 305 | } | |
| 306 | ||
| 307 | /* Strategy routine called from dm_strategy. */ | |
| 308 | /* | |
| 309 | * Do IO operation, called from dmstrategy routine. | |
| 310 | */ | |
| 311 | int | |
| 312 | dm_target_dmirror_strategy(dm_table_entry_t * table_en, struct buf * bp) | |
| 313 | { | |
| 314 | struct bio *bio, *split_bio1, *split_bio2; | |
| 315 | struct buf *bp; | |
| 316 | off_t bseg, eseg, seg_end; | |
| 317 | size_t fsb; | |
| 318 | int split_transaction = 0; | |
| 319 | ||
| 320 | dm_target_crypt_config_t *priv; | |
| 321 | priv = table_en->target_config; | |
| 322 | ||
| 323 | if ((bp->b_cmd == BUF_CMD_READ) || (bp->b_cmd == BUF_CMD_WRITE)) { | |
| 324 | /* Get rid of stuff we can't really handle */ | |
| 325 | if (((bp->b_bcount % DEV_BSIZE) != 0) || (bp->b_bcount == 0)) { | |
| 326 | kprintf("dm_target_dmirror_strategy: can't really handle bp->b_bcount = %d\n", bp->b_bcount); | |
| 327 | bp->b_error = EINVAL; | |
| 328 | bp->b_flags |= B_ERROR | B_INVAL; | |
| 329 | biodone(&bp->b_bio1); | |
| 330 | return 0; | |
| 331 | } | |
| 332 | ||
| 333 | bseg = SEGNO_FROM_OFFSET(bp->b_bio1.bio_offset); | |
| 334 | eseg = SEGNO_FROM_OFFSET(bp->b_bio1.bio_offset + bp->b_resid); | |
| 335 | seg_end = OFFSET_FROM_SEGNO(eseg); | |
| 336 | ||
| 337 | if (bseg != eseg) { | |
| 338 | split_transaction = 1; | |
| 339 | /* fsb = first segment bytes (bytes in the first segment) */ | |
| 340 | fsb = seg_end - bp->b_bio1.bio_offset; | |
| 341 | ||
| 342 | nestbuf = getpbuf(NULL); | |
| 343 | nestiobuf_setup(&bp->b_bio1, nestbuf, 0, fsb); | |
| 344 | split_bio1 = push_bio(&nestbuf->b_bio1); | |
| 345 | split_bio1->bio_offset = bp->b_bio1.bio_offset + | |
| 346 | priv->block_offset*DEV_BSIZE; | |
| 347 | ||
| 348 | nestbuf = getpbuf(NULL); | |
| 349 | nestiobuf_setup(&bp->b_bio1, nestbuf, fsb, bp->b_resid - fsb); | |
| 350 | split_bio2 = push_bio(&nestbuf->b_bio1); | |
| 351 | split_bio2->bio_offset = bp->b_bio1.bio_offset + fsb + | |
| 352 | priv->block_offset*DEV_BSIZE; | |
| 353 | } | |
| 354 | } | |
| 355 | ||
| 356 | switch (bp->b_cmd) { | |
| 357 | case BUF_CMD_READ: | |
| 358 | if (split_transaction) { | |
| 359 | dmirror_read(priv, split_bio1); | |
| 360 | dmirror_read(priv, split_bio2); | |
| 361 | } else { | |
| 362 | bio = push_bio(&bp->b_bio1); | |
| 363 | bio->bio_offset = bp->b_bio1.bio_offset + priv->block_offset*DEV_BSIZE; | |
| 364 | dmirror_read(priv, bio); | |
| 365 | } | |
| 366 | break; | |
| 367 | ||
| 368 | case BUF_CMD_WRITE: | |
| 369 | if (split_transaction) { | |
| 370 | dmirror_write(priv, split_bio1); | |
| 371 | dmirror_write(priv, split_bio2); | |
| 372 | } else { | |
| 373 | bio = push_bio(&bp->b_bio1); | |
| 374 | bio->bio_offset = bp->b_bio1.bio_offset + priv->block_offset*DEV_BSIZE; | |
| 375 | dmirror_write(priv, bio); | |
| 376 | } | |
| 377 | break; | |
| 378 | ||
| 379 | default: | |
| 380 | /* XXX: clone... */ | |
| 381 | vn_strategy(priv->pdev[0]->pdev_vnode, &bp->b_bio1); | |
| 382 | vn_strategy(priv->pdev[1]->pdev_vnode, &bp->b_bio1); | |
| 383 | } | |
| 384 | ||
| 385 | return 0; | |
| 386 | ||
| 387 | } | |
| 388 | ||
| 389 | /* XXX: add missing dm functions */ |