| 1 | /* |
| 2 | * Copyright (c) 2008 The DragonFly Project. All rights reserved. |
| 3 | * |
| 4 | * This code is derived from software contributed to The DragonFly Project |
| 5 | * by Matthew Dillon <dillon@backplane.com> |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in |
| 15 | * the documentation and/or other materials provided with the |
| 16 | * distribution. |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 18 | * contributors may be used to endorse or promote products derived |
| 19 | * from this software without specific, prior written permission. |
| 20 | * |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 32 | * SUCH DAMAGE. |
| 33 | * |
| 34 | * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.19 2008/09/23 21:03:52 dillon Exp $ |
| 35 | */ |
| 36 | |
| 37 | #include "hammer.h" |
| 38 | |
| 39 | /* |
| 40 | * Iterate through the specified range of object ids and remove any |
| 41 | * deleted records that fall entirely within a prune modulo. |
| 42 | * |
| 43 | * A reverse iteration is used to prevent overlapping records from being |
| 44 | * created during the iteration due to alignments. This also allows us |
| 45 | * to adjust alignments without blowing up the B-Tree. |
| 46 | */ |
| 47 | static int prune_should_delete(struct hammer_ioc_prune *prune, |
| 48 | hammer_btree_leaf_elm_t elm); |
| 49 | static void prune_check_nlinks(hammer_cursor_t cursor, |
| 50 | hammer_btree_leaf_elm_t elm); |
| 51 | |
| 52 | int |
| 53 | hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip, |
| 54 | struct hammer_ioc_prune *prune) |
| 55 | { |
| 56 | struct hammer_cursor cursor; |
| 57 | hammer_btree_leaf_elm_t elm; |
| 58 | struct hammer_ioc_prune_elm *copy_elms; |
| 59 | struct hammer_ioc_prune_elm *user_elms; |
| 60 | int error; |
| 61 | int isdir; |
| 62 | int elm_array_size; |
| 63 | int seq; |
| 64 | |
| 65 | if (prune->nelms < 0 || prune->nelms > HAMMER_MAX_PRUNE_ELMS) |
| 66 | return(EINVAL); |
| 67 | if ((prune->key_beg.localization | prune->key_end.localization) & |
| 68 | HAMMER_LOCALIZE_PSEUDOFS_MASK) { |
| 69 | return(EINVAL); |
| 70 | } |
| 71 | if (prune->key_beg.localization > prune->key_end.localization) |
| 72 | return(EINVAL); |
| 73 | if (prune->key_beg.localization == prune->key_end.localization) { |
| 74 | if (prune->key_beg.obj_id > prune->key_end.obj_id) |
| 75 | return(EINVAL); |
| 76 | /* key-space limitations - no check needed */ |
| 77 | } |
| 78 | if ((prune->head.flags & HAMMER_IOC_PRUNE_ALL) && prune->nelms) |
| 79 | return(EINVAL); |
| 80 | |
| 81 | prune->key_cur.localization = (prune->key_end.localization & |
| 82 | HAMMER_LOCALIZE_MASK) + |
| 83 | ip->obj_localization; |
| 84 | prune->key_cur.obj_id = prune->key_end.obj_id; |
| 85 | prune->key_cur.key = HAMMER_MAX_KEY; |
| 86 | |
| 87 | /* |
| 88 | * Copy element array from userland |
| 89 | */ |
| 90 | elm_array_size = sizeof(*copy_elms) * prune->nelms; |
| 91 | user_elms = prune->elms; |
| 92 | copy_elms = kmalloc(elm_array_size, M_TEMP, M_WAITOK); |
| 93 | if ((error = copyin(user_elms, copy_elms, elm_array_size)) != 0) |
| 94 | goto failed; |
| 95 | prune->elms = copy_elms; |
| 96 | |
| 97 | seq = trans->hmp->flusher.act; |
| 98 | |
| 99 | /* |
| 100 | * Scan backwards. Retries typically occur if a deadlock is detected. |
| 101 | */ |
| 102 | retry: |
| 103 | error = hammer_init_cursor(trans, &cursor, NULL, NULL); |
| 104 | if (error) { |
| 105 | hammer_done_cursor(&cursor); |
| 106 | goto failed; |
| 107 | } |
| 108 | cursor.key_beg.localization = (prune->key_beg.localization & |
| 109 | HAMMER_LOCALIZE_MASK) + |
| 110 | ip->obj_localization; |
| 111 | cursor.key_beg.obj_id = prune->key_beg.obj_id; |
| 112 | cursor.key_beg.key = HAMMER_MIN_KEY; |
| 113 | cursor.key_beg.create_tid = 1; |
| 114 | cursor.key_beg.delete_tid = 0; |
| 115 | cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE; |
| 116 | cursor.key_beg.obj_type = 0; |
| 117 | |
| 118 | cursor.key_end.localization = prune->key_cur.localization; |
| 119 | cursor.key_end.obj_id = prune->key_cur.obj_id; |
| 120 | cursor.key_end.key = prune->key_cur.key; |
| 121 | cursor.key_end.create_tid = HAMMER_MAX_TID - 1; |
| 122 | cursor.key_end.delete_tid = 0; |
| 123 | cursor.key_end.rec_type = HAMMER_MAX_RECTYPE; |
| 124 | cursor.key_end.obj_type = 0; |
| 125 | |
| 126 | cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; |
| 127 | cursor.flags |= HAMMER_CURSOR_BACKEND; |
| 128 | |
| 129 | /* |
| 130 | * This flag allows the B-Tree code to clean up loose ends. At |
| 131 | * the moment (XXX) it also means we have to hold the sync lock |
| 132 | * through the iteration. |
| 133 | */ |
| 134 | cursor.flags |= HAMMER_CURSOR_PRUNING; |
| 135 | |
| 136 | hammer_sync_lock_sh(trans); |
| 137 | error = hammer_btree_last(&cursor); |
| 138 | hammer_sync_unlock(trans); |
| 139 | |
| 140 | while (error == 0) { |
| 141 | /* |
| 142 | * Check for work |
| 143 | */ |
| 144 | elm = &cursor.node->ondisk->elms[cursor.index].leaf; |
| 145 | prune->key_cur = elm->base; |
| 146 | |
| 147 | /* |
| 148 | * Yield to more important tasks |
| 149 | */ |
| 150 | if ((error = hammer_signal_check(trans->hmp)) != 0) |
| 151 | break; |
| 152 | |
| 153 | if (prune->stat_oldest_tid > elm->base.create_tid) |
| 154 | prune->stat_oldest_tid = elm->base.create_tid; |
| 155 | |
| 156 | if (hammer_debug_general & 0x0200) { |
| 157 | kprintf("check %016llx %016llx cre=%016llx del=%016llx\n", |
| 158 | elm->base.obj_id, |
| 159 | elm->base.key, |
| 160 | elm->base.create_tid, |
| 161 | elm->base.delete_tid); |
| 162 | } |
| 163 | |
| 164 | if (prune_should_delete(prune, elm)) { |
| 165 | if (hammer_debug_general & 0x0200) { |
| 166 | kprintf("check %016llx %016llx: DELETE\n", |
| 167 | elm->base.obj_id, elm->base.key); |
| 168 | } |
| 169 | |
| 170 | /* |
| 171 | * NOTE: This can return EDEADLK |
| 172 | * |
| 173 | * Acquiring the sync lock guarantees that the |
| 174 | * operation will not cross a synchronization |
| 175 | * boundary (see the flusher). |
| 176 | * |
| 177 | * We dont need to track inodes or next_tid when |
| 178 | * we are destroying deleted records. |
| 179 | */ |
| 180 | isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY); |
| 181 | |
| 182 | hammer_sync_lock_sh(trans); |
| 183 | error = hammer_delete_at_cursor(&cursor, |
| 184 | HAMMER_DELETE_DESTROY, |
| 185 | cursor.trans->tid, |
| 186 | cursor.trans->time32, |
| 187 | 0, &prune->stat_bytes); |
| 188 | hammer_sync_unlock(trans); |
| 189 | if (error) |
| 190 | break; |
| 191 | |
| 192 | if (isdir) |
| 193 | ++prune->stat_dirrecords; |
| 194 | else |
| 195 | ++prune->stat_rawrecords; |
| 196 | |
| 197 | /* |
| 198 | * The current record might now be the one after |
| 199 | * the one we deleted, set ATEDISK to force us |
| 200 | * to skip it (since we are iterating backwards). |
| 201 | */ |
| 202 | cursor.flags |= HAMMER_CURSOR_ATEDISK; |
| 203 | } else { |
| 204 | /* |
| 205 | * Nothing to delete, but we may have to check other |
| 206 | * things. |
| 207 | */ |
| 208 | prune_check_nlinks(&cursor, elm); |
| 209 | cursor.flags |= HAMMER_CURSOR_ATEDISK; |
| 210 | if (hammer_debug_general & 0x0100) { |
| 211 | kprintf("check %016llx %016llx: SKIP\n", |
| 212 | elm->base.obj_id, elm->base.key); |
| 213 | } |
| 214 | } |
| 215 | ++prune->stat_scanrecords; |
| 216 | |
| 217 | while (hammer_flusher_meta_halflimit(trans->hmp) || |
| 218 | hammer_flusher_undo_exhausted(trans, 2)) { |
| 219 | hammer_unlock_cursor(&cursor); |
| 220 | hammer_flusher_wait(trans->hmp, seq); |
| 221 | hammer_lock_cursor(&cursor); |
| 222 | seq = hammer_flusher_async_one(trans->hmp); |
| 223 | } |
| 224 | hammer_sync_lock_sh(trans); |
| 225 | error = hammer_btree_iterate_reverse(&cursor); |
| 226 | hammer_sync_unlock(trans); |
| 227 | } |
| 228 | if (error == ENOENT) |
| 229 | error = 0; |
| 230 | hammer_done_cursor(&cursor); |
| 231 | if (error == EDEADLK) |
| 232 | goto retry; |
| 233 | if (error == EINTR) { |
| 234 | prune->head.flags |= HAMMER_IOC_HEAD_INTR; |
| 235 | error = 0; |
| 236 | } |
| 237 | failed: |
| 238 | prune->key_cur.localization &= HAMMER_LOCALIZE_MASK; |
| 239 | prune->elms = user_elms; |
| 240 | kfree(copy_elms, M_TEMP); |
| 241 | return(error); |
| 242 | } |
| 243 | |
| 244 | /* |
| 245 | * Check pruning list. The list must be sorted in descending order. |
| 246 | * |
| 247 | * Return non-zero if the record should be deleted. |
| 248 | */ |
| 249 | static int |
| 250 | prune_should_delete(struct hammer_ioc_prune *prune, hammer_btree_leaf_elm_t elm) |
| 251 | { |
| 252 | struct hammer_ioc_prune_elm *scan; |
| 253 | int i; |
| 254 | |
| 255 | /* |
| 256 | * If pruning everything remove all records with a non-zero |
| 257 | * delete_tid. |
| 258 | */ |
| 259 | if (prune->head.flags & HAMMER_IOC_PRUNE_ALL) { |
| 260 | if (elm->base.delete_tid != 0) |
| 261 | return(1); |
| 262 | return(0); |
| 263 | } |
| 264 | |
| 265 | for (i = 0; i < prune->nelms; ++i) { |
| 266 | scan = &prune->elms[i]; |
| 267 | |
| 268 | /* |
| 269 | * Check for loop termination. |
| 270 | */ |
| 271 | if (elm->base.create_tid >= scan->end_tid || |
| 272 | elm->base.delete_tid > scan->end_tid) { |
| 273 | break; |
| 274 | } |
| 275 | |
| 276 | /* |
| 277 | * Determine if we can delete the record. |
| 278 | */ |
| 279 | if (elm->base.delete_tid && |
| 280 | elm->base.create_tid >= scan->beg_tid && |
| 281 | elm->base.delete_tid <= scan->end_tid && |
| 282 | (elm->base.create_tid - scan->beg_tid) / scan->mod_tid == |
| 283 | (elm->base.delete_tid - scan->beg_tid) / scan->mod_tid) { |
| 284 | return(1); |
| 285 | } |
| 286 | } |
| 287 | return(0); |
| 288 | } |
| 289 | |
| 290 | /* |
| 291 | * Dangling inodes can occur if processes are holding open descriptors on |
| 292 | * deleted files as-of when a machine crashes. When we find one simply |
| 293 | * acquire the inode and release it. The inode handling code will then |
| 294 | * do the right thing. |
| 295 | */ |
| 296 | static |
| 297 | void |
| 298 | prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) |
| 299 | { |
| 300 | hammer_inode_t ip; |
| 301 | int error; |
| 302 | |
| 303 | if (elm->base.rec_type != HAMMER_RECTYPE_INODE) |
| 304 | return; |
| 305 | if (elm->base.delete_tid != 0) |
| 306 | return; |
| 307 | if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA)) |
| 308 | return; |
| 309 | if (cursor->data->inode.nlinks) |
| 310 | return; |
| 311 | hammer_cursor_downgrade(cursor); |
| 312 | ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id, |
| 313 | HAMMER_MAX_TID, |
| 314 | elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK, |
| 315 | 0, &error); |
| 316 | if (ip) { |
| 317 | if (hammer_debug_general & 0x0001) { |
| 318 | kprintf("pruning disconnected inode %016llx\n", |
| 319 | elm->base.obj_id); |
| 320 | } |
| 321 | hammer_rel_inode(ip, 0); |
| 322 | hammer_inode_waitreclaims(cursor->trans->hmp); |
| 323 | } else { |
| 324 | kprintf("unable to prune disconnected inode %016llx\n", |
| 325 | elm->base.obj_id); |
| 326 | } |
| 327 | } |
| 328 | |
| 329 | #if 0 |
| 330 | |
| 331 | /* |
| 332 | * NOTE: THIS CODE HAS BEEN REMOVED! Pruning no longer attempts to realign |
| 333 | * adjacent records because it seriously interferes with every |
| 334 | * mirroring algorithm I could come up with. |
| 335 | * |
| 336 | * This means that historical accesses beyond the first snapshot |
| 337 | * softlink should be on snapshot boundaries only. Historical |
| 338 | * accesses from "now" to the first snapshot softlink continue to |
| 339 | * be fine-grained. |
| 340 | * |
| 341 | * NOTE: It also looks like there's a bug in the removed code. It is believed |
| 342 | * that create_tid can sometimes get set to 0xffffffffffffffff. Just as |
| 343 | * well we no longer try to do this fancy shit. Probably the attempt to |
| 344 | * correct the rhb is blowing up the cursor's indexing or addressing mapping. |
| 345 | * |
| 346 | * Align the record to cover any gaps created through the deletion of |
| 347 | * records within the pruning space. If we were to just delete the records |
| 348 | * there would be gaps which in turn would cause a snapshot that is NOT on |
| 349 | * a pruning boundary to appear corrupt to the user. Forcing alignment |
| 350 | * of the create_tid and delete_tid for retained records 'reconnects' |
| 351 | * the previously contiguous space, making it contiguous again after the |
| 352 | * deletions. |
| 353 | * |
| 354 | * The use of a reverse iteration allows us to safely align the records and |
| 355 | * related elements without creating temporary overlaps. XXX we should |
| 356 | * add ordering dependancies for record buffers to guarantee consistency |
| 357 | * during recovery. |
| 358 | */ |
| 359 | static int |
| 360 | realign_prune(struct hammer_ioc_prune *prune, |
| 361 | hammer_cursor_t cursor, int realign_cre, int realign_del) |
| 362 | { |
| 363 | struct hammer_ioc_prune_elm *scan; |
| 364 | hammer_btree_elm_t elm; |
| 365 | hammer_tid_t delta; |
| 366 | hammer_tid_t tid; |
| 367 | int error; |
| 368 | |
| 369 | hammer_cursor_downgrade(cursor); |
| 370 | |
| 371 | elm = &cursor->node->ondisk->elms[cursor->index]; |
| 372 | ++prune->stat_realignments; |
| 373 | |
| 374 | /* |
| 375 | * Align the create_tid. By doing a reverse iteration we guarantee |
| 376 | * that all records after our current record have already been |
| 377 | * aligned, allowing us to safely correct the right-hand-boundary |
| 378 | * (because no record to our right is otherwise exactly matching |
| 379 | * will have a create_tid to the left of our aligned create_tid). |
| 380 | */ |
| 381 | error = 0; |
| 382 | if (realign_cre >= 0) { |
| 383 | scan = &prune->elms[realign_cre]; |
| 384 | |
| 385 | delta = (elm->leaf.base.create_tid - scan->beg_tid) % |
| 386 | scan->mod_tid; |
| 387 | if (delta) { |
| 388 | tid = elm->leaf.base.create_tid - delta + scan->mod_tid; |
| 389 | |
| 390 | /* can EDEADLK */ |
| 391 | error = hammer_btree_correct_rhb(cursor, tid + 1); |
| 392 | if (error == 0) { |
| 393 | error = hammer_btree_extract(cursor, |
| 394 | HAMMER_CURSOR_GET_LEAF); |
| 395 | } |
| 396 | if (error == 0) { |
| 397 | /* can EDEADLK */ |
| 398 | error = hammer_cursor_upgrade(cursor); |
| 399 | } |
| 400 | if (error == 0) { |
| 401 | hammer_modify_node(cursor->trans, cursor->node, |
| 402 | &elm->leaf.base.create_tid, |
| 403 | sizeof(elm->leaf.base.create_tid)); |
| 404 | elm->leaf.base.create_tid = tid; |
| 405 | hammer_modify_node_done(cursor->node); |
| 406 | } |
| 407 | } |
| 408 | } |
| 409 | |
| 410 | /* |
| 411 | * Align the delete_tid. This only occurs if the record is historical |
| 412 | * was deleted at some point. Realigning the delete_tid does not |
| 413 | * move the record within the B-Tree but may cause it to temporarily |
| 414 | * overlap a record that has not yet been pruned. |
| 415 | */ |
| 416 | if (error == 0 && realign_del >= 0) { |
| 417 | scan = &prune->elms[realign_del]; |
| 418 | |
| 419 | delta = (elm->leaf.base.delete_tid - scan->beg_tid) % |
| 420 | scan->mod_tid; |
| 421 | if (delta) { |
| 422 | error = hammer_btree_extract(cursor, |
| 423 | HAMMER_CURSOR_GET_LEAF); |
| 424 | if (error == 0) { |
| 425 | hammer_modify_node(cursor->trans, cursor->node, |
| 426 | &elm->leaf.base.delete_tid, |
| 427 | sizeof(elm->leaf.base.delete_tid)); |
| 428 | elm->leaf.base.delete_tid = |
| 429 | elm->leaf.base.delete_tid - |
| 430 | delta + scan->mod_tid; |
| 431 | hammer_modify_node_done(cursor->node); |
| 432 | } |
| 433 | } |
| 434 | } |
| 435 | return (error); |
| 436 | } |
| 437 | |
| 438 | #endif |