acxcontrol(8) is gone.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.28 2008/02/05 07:58:43 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid == 0) {
61                 if (rec2->rec.base.base.create_tid == 0)
62                         return(0);
63                 return(1);
64         }
65         if (rec2->rec.base.base.create_tid == 0)
66                 return(-1);
67
68         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
69                 return(-1);
70         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
71                 return(1);
72         return(0);
73 }
74
75 static int
76 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
77 {
78         if (info->rec_type < rec->rec.base.base.rec_type)
79                 return(-3);
80         if (info->rec_type > rec->rec.base.base.rec_type)
81                 return(3);
82
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         if (info->create_tid == 0) {
89                 if (rec->rec.base.base.create_tid == 0)
90                         return(0);
91                 return(1);
92         }
93         if (rec->rec.base.base.create_tid == 0)
94                 return(-1);
95         if (info->create_tid < rec->rec.base.base.create_tid)
96                 return(-1);
97         if (info->create_tid > rec->rec.base.base.create_tid)
98                 return(1);
99         return(0);
100 }
101
102 /*
103  * RB_SCAN comparison code for hammer_mem_first().  The argument order
104  * is reversed so the comparison result has to be negated.  key_beg and
105  * key_end are both range-inclusive.
106  *
107  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
108  * These do not stop the scan.
109  *
110  * Localized deletions are not cached in-memory.
111  */
112 static
113 int
114 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
115 {
116         hammer_cursor_t cursor = data;
117         int r;
118
119         r = hammer_rec_compare(&cursor->key_beg, rec);
120         if (r > 1)
121                 return(-1);
122         r = hammer_rec_compare(&cursor->key_end, rec);
123         if (r < -1)
124                 return(1);
125         return(0);
126 }
127
128 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
129 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
130                     hammer_rec_compare, hammer_base_elm_t);
131
132 /*
133  * Allocate a record for the caller to finish filling in.  The record is
134  * returned referenced.
135  */
136 hammer_record_t
137 hammer_alloc_mem_record(hammer_inode_t ip)
138 {
139         hammer_record_t record;
140
141         ++hammer_count_records;
142         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
143         record->ip = ip;
144         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
145         hammer_ref(&record->lock);
146         return (record);
147 }
148
149 /*
150  * Release a memory record.  Records marked for deletion are immediately
151  * removed from the RB-Tree but otherwise left intact until the last ref
152  * goes away.
153  */
154 void
155 hammer_rel_mem_record(struct hammer_record *record)
156 {
157         hammer_unref(&record->lock);
158
159         if (record->flags & HAMMER_RECF_DELETED) {
160                 if (record->flags & HAMMER_RECF_ONRBTREE) {
161                         RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
162                                   record);
163                         record->flags &= ~HAMMER_RECF_ONRBTREE;
164                 }
165                 if (record->lock.refs == 0) {
166                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
167                                 --hammer_count_record_datas;
168                                 kfree(record->data, M_HAMMER);
169                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
170                         }
171                         record->data = NULL;
172                         --hammer_count_records;
173                         kfree(record, M_HAMMER);
174                         return;
175                 }
176         }
177
178         /*
179          * If someone wanted the record wake them up.
180          */
181         if (record->flags & HAMMER_RECF_WANTED) {
182                 record->flags &= ~HAMMER_RECF_WANTED;
183                 wakeup(record);
184         }
185 }
186
187 /*
188  * Lookup an in-memory record given the key specified in the cursor.  Works
189  * just like hammer_btree_lookup() but operates on an inode's in-memory
190  * record list.
191  *
192  * The lookup must fail if the record is marked for deferred deletion.
193  */
194 static
195 int
196 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
197 {
198         int error;
199
200         if (cursor->iprec) {
201                 hammer_rel_mem_record(cursor->iprec);
202                 cursor->iprec = NULL;
203         }
204         if (cursor->ip) {
205                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
206                                                   &cursor->ip->rec_tree);
207         }
208         cursor->ip = ip;
209         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
210         cursor->scan.node = NULL;
211         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
212                                 &ip->rec_tree, &cursor->key_beg);
213         if (cursor->iprec == NULL) {
214                 error = ENOENT;
215         } else {
216                 hammer_ref(&cursor->iprec->lock);
217                 error = 0;
218         }
219         return(error);
220 }
221
222 /*
223  * hammer_mem_first() - locate the first in-memory record matching the
224  * cursor.
225  *
226  * The RB_SCAN function we use is designed as a callback.  We terminate it
227  * (return -1) as soon as we get a match.
228  */
229 static
230 int
231 hammer_rec_scan_callback(hammer_record_t rec, void *data)
232 {
233         hammer_cursor_t cursor = data;
234
235         /*
236          * We terminate on success, so this should be NULL on entry.
237          */
238         KKASSERT(cursor->iprec == NULL);
239
240         /*
241          * Skip if the record was marked deleted
242          */
243         if (rec->flags & HAMMER_RECF_DELETED)
244                 return(0);
245
246         /*
247          * Skip if not visible due to our as-of TID
248          */
249         if (cursor->flags & HAMMER_CURSOR_ASOF) {
250                 if (cursor->asof < rec->rec.base.base.create_tid)
251                         return(0);
252                 if (rec->rec.base.base.delete_tid &&
253                     cursor->asof >= rec->rec.base.base.delete_tid) {
254                         return(0);
255                 }
256         }
257
258         /*
259          * Block if currently being synchronized to disk, otherwise we
260          * may get a duplicate.  Wakeup the syncer if it's stuck on
261          * the record.
262          */
263         hammer_ref(&rec->lock);
264         ++rec->blocked;
265         while (rec->flags & HAMMER_RECF_SYNCING) {
266                 rec->flags |= HAMMER_RECF_WANTED;
267                 tsleep(rec, 0, "hmrrc2", 0);
268         }
269         --rec->blocked;
270
271         /*
272          * The record may have been deleted while we were blocked.
273          */
274         if (rec->flags & HAMMER_RECF_DELETED) {
275                 hammer_rel_mem_record(cursor->iprec);
276                 return(0);
277         }
278
279         /*
280          * Set the matching record and stop the scan.
281          */
282         cursor->iprec = rec;
283         return(-1);
284 }
285
286 static
287 int
288 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
289 {
290         if (cursor->iprec) {
291                 hammer_rel_mem_record(cursor->iprec);
292                 cursor->iprec = NULL;
293         }
294         if (cursor->ip) {
295                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
296                                                   &cursor->ip->rec_tree);
297         }
298         cursor->ip = ip;
299         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
300
301         cursor->scan.node = NULL;
302         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
303                                    hammer_rec_scan_callback, cursor);
304
305         /*
306          * Adjust scan.node and keep it linked into the RB-tree so we can
307          * hold the cursor through third party modifications of the RB-tree.
308          */
309         if (cursor->iprec) {
310                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
311                 return(0);
312         }
313         return(ENOENT);
314 }
315
316 void
317 hammer_mem_done(hammer_cursor_t cursor)
318 {
319         if (cursor->ip) {
320                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
321                                                   &cursor->ip->rec_tree);
322                 cursor->ip = NULL;
323         }
324         if (cursor->iprec) {
325                 hammer_rel_mem_record(cursor->iprec);
326                 cursor->iprec = NULL;
327         }
328 }
329
330 /************************************************************************
331  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
332  ************************************************************************
333  *
334  * These functions manipulate in-memory records.  Such records typically
335  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
336  */
337
338 /*
339  * Add a directory entry (dip,ncp) which references inode (ip).
340  *
341  * Note that the low 32 bits of the namekey are set temporarily to create
342  * a unique in-memory record, and may be modified a second time when the
343  * record is synchronized to disk.  In particular, the low 32 bits cannot be
344  * all 0's when synching to disk, which is not handled here.
345  */
346 int
347 hammer_ip_add_directory(struct hammer_transaction *trans,
348                      struct hammer_inode *dip, struct namecache *ncp,
349                      struct hammer_inode *ip)
350 {
351         hammer_record_t record;
352         int error;
353         int bytes;
354
355         record = hammer_alloc_mem_record(dip);
356
357         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
358         if (++trans->hmp->namekey_iterator == 0)
359                 ++trans->hmp->namekey_iterator;
360
361         record->rec.entry.base.base.obj_id = dip->obj_id;
362         record->rec.entry.base.base.key =
363                 hammer_directory_namekey(ncp->nc_name, bytes);
364         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
365         record->rec.entry.base.base.create_tid = trans->tid;
366         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
367         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
368         record->rec.entry.obj_id = ip->obj_id;
369         if (bytes <= sizeof(record->rec.entry.den_name)) {
370                 record->data = (void *)record->rec.entry.den_name;
371                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
372         } else {
373                 ++hammer_count_record_datas;
374                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
375                 record->flags |= HAMMER_RECF_ALLOCDATA;
376         }
377         bcopy(ncp->nc_name, record->data, bytes);
378         record->rec.entry.base.data_len = bytes;
379         ++ip->ino_rec.ino_nlinks;
380         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
381         error = hammer_mem_add(trans, record);
382         return(error);
383 }
384
385 /*
386  * Delete the directory entry and update the inode link count.  The
387  * cursor must be seeked to the directory entry record being deleted.
388  *
389  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
390  *
391  * This function can return EDEADLK requiring the caller to terminate
392  * the cursor and retry.
393  */
394 int
395 hammer_ip_del_directory(struct hammer_transaction *trans,
396                      hammer_cursor_t cursor, struct hammer_inode *dip,
397                      struct hammer_inode *ip)
398 {
399         int error;
400
401         error = hammer_ip_delete_record(cursor, trans->tid);
402
403         /*
404          * One less link.  The file may still be open in the OS even after
405          * all links have gone away so we only try to sync if the OS has
406          * no references and nlinks falls to 0.
407          *
408          * We have to terminate the cursor before syncing the inode to
409          * avoid deadlocking against ourselves.
410          */
411         if (error == 0) {
412                 --ip->ino_rec.ino_nlinks;
413                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
414                 if (ip->ino_rec.ino_nlinks == 0 &&
415                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
416                         hammer_done_cursor(cursor);
417                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
418                 }
419
420         }
421         return(error);
422 }
423
424 /*
425  * Add a record to an inode.
426  *
427  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
428  * initialize the following additional fields:
429  *
430  * record->rec.entry.base.base.key
431  * record->rec.entry.base.base.rec_type
432  * record->rec.entry.base.base.data_len
433  * record->data         (a copy will be kmalloc'd if not embedded)
434  */
435 int
436 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
437 {
438         hammer_inode_t ip = record->ip;
439         int error;
440         int bytes;
441         void *data;
442
443         record->rec.base.base.obj_id = ip->obj_id;
444         record->rec.base.base.create_tid = trans->tid;
445         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
446         bytes = record->rec.base.data_len;
447
448         if (record->data) {
449                 if ((char *)record->data < (char *)&record->rec ||
450                     (char *)record->data >= (char *)(&record->rec + 1)) {
451                         ++hammer_count_record_datas;
452                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
453                         record->flags |= HAMMER_RECF_ALLOCDATA;
454                         bcopy(record->data, data, bytes);
455                         record->data = data;
456                 } else {
457                         record->flags |= HAMMER_RECF_EMBEDDED_DATA;
458                 }
459         }
460         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
461         error = hammer_mem_add(trans, record);
462         return(error);
463 }
464
465 /*
466  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
467  * is called via the strategy called from a cached data source.  This code
468  * is responsible for actually writing a data record out to the disk.
469  *
470  * This can only occur non-historically (i.e. 'current' data only).
471  */
472 int
473 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
474                        int64_t offset, void *data, int bytes,
475                        struct hammer_cursor **spike)
476 {
477         struct hammer_cursor cursor;
478         hammer_record_ondisk_t rec;
479         union hammer_btree_elm elm;
480         void *bdata;
481         int error;
482
483         KKASSERT((offset & HAMMER_BUFMASK) == 0);
484         KKASSERT((bytes & HAMMER_BUFMASK) == 0);
485 retry:
486         error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
487         if (error)
488                 return(error);
489         cursor.key_beg.obj_id = ip->obj_id;
490         cursor.key_beg.key = offset + bytes;
491         cursor.key_beg.create_tid = trans->tid;
492         cursor.key_beg.delete_tid = 0;
493         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
494         cursor.asof = trans->tid;
495         cursor.flags |= HAMMER_CURSOR_INSERT;
496
497         /*
498          * Issue a lookup to position the cursor and locate the cluster
499          */
500         error = hammer_btree_lookup(&cursor);
501         if (error == 0) {
502                 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
503                         offset, bytes);
504                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
505                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
506                 error = EIO;
507         }
508         if (error != ENOENT)
509                 goto done;
510
511         /*
512          * Allocate record and data space now that we know which cluster
513          * the B-Tree node ended up in.
514          */
515         bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
516                                   &cursor.data_buffer);
517         if (bdata == NULL)
518                 goto done;
519         rec = hammer_alloc_record(cursor.node->cluster, &error,
520                                   HAMMER_RECTYPE_DATA, &cursor.record_buffer);
521         if (rec == NULL)
522                 goto fail1;
523
524         /*
525          * Fill everything in and insert our B-Tree node.
526          */
527         hammer_modify_buffer(cursor.record_buffer);
528         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
529         rec->base.base.obj_id = ip->obj_id;
530         rec->base.base.key = offset + bytes;
531         rec->base.base.create_tid = trans->tid;
532         rec->base.base.delete_tid = 0;
533         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
534         rec->base.data_crc = crc32(data, bytes);
535         rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);
536         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
537         rec->base.data_len = bytes;
538
539         hammer_modify_buffer(cursor.data_buffer);
540         bcopy(data, bdata, bytes);
541
542         elm.leaf.base = rec->base.base;
543         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
544         elm.leaf.data_offset = rec->base.data_offset;
545         elm.leaf.data_len = bytes;
546         elm.leaf.data_crc = rec->base.data_crc;
547
548         /*
549          * Data records can wind up on-disk before the inode itself is
550          * on-disk.  One must assume data records may be on-disk if either
551          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
552          */
553         ip->flags |= HAMMER_INODE_DONDISK;
554
555         error = hammer_btree_insert(&cursor, &elm);
556         if (error == 0)
557                 goto done;
558
559         hammer_free_record_ptr(cursor.record_buffer, rec, HAMMER_RECTYPE_DATA);
560 fail1:
561         hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
562 done:
563         /*
564          * If ENOSPC in cluster fill in the spike structure and return
565          * ENOSPC.
566          */
567         if (error == ENOSPC)
568                 hammer_load_spike(&cursor, spike);
569         hammer_done_cursor(&cursor);
570         if (error == EDEADLK)
571                 goto retry;
572         return(error);
573 }
574
575 /*
576  * Sync an in-memory record to the disk.  this is typically called via fsync
577  * from a cached record source.  This code is responsible for actually
578  * writing a record out to the disk.
579  */
580 int
581 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
582 {
583         struct hammer_cursor cursor;
584         hammer_record_ondisk_t rec;
585         hammer_mount_t hmp;
586         union hammer_btree_elm elm;
587         void *bdata;
588         int error;
589
590 retry:
591         /*
592          * If the record has been deleted or is being synchronized, stop.
593          * Interlock with the syncing flag.
594          */
595         if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING))
596                 return(0);
597         record->flags |= HAMMER_RECF_SYNCING;
598
599         /*
600          * If someone other then us is referencing the record and not
601          * blocking waiting for us, we have to wait until they finish.
602          *
603          * It is possible the record got destroyed while we were blocked.
604          */
605         if (record->lock.refs > record->blocked + 1) {
606                 while (record->lock.refs > record->blocked + 1) {
607                         record->flags |= HAMMER_RECF_WANTED;
608                         tsleep(record, 0, "hmrrc1", 0);
609                 }
610                 if (record->flags & HAMMER_RECF_DELETED)
611                         return(0);
612         }
613
614         /*
615          * Get a cursor
616          */
617         error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
618                                        record->ip->hmp);
619         if (error)
620                 return(error);
621         cursor.key_beg = record->rec.base.base;
622         cursor.flags |= HAMMER_CURSOR_INSERT;
623
624         /*
625          * Issue a lookup to position the cursor and locate the cluster.  The
626          * target key should not exist.  If we are creating a directory entry
627          * we may have to iterate the low 32 bits of the key to find an unused
628          * key.
629          */
630         for (;;) {
631                 error = hammer_btree_lookup(&cursor);
632                 if (error)
633                         break;
634                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
635                         kprintf("hammer_ip_sync_record: duplicate rec "
636                                 "at (%016llx)\n", record->rec.base.base.key);
637                         Debugger("duplicate record1");
638                         error = EIO;
639                         break;
640                 }
641                 hmp = cursor.node->cluster->volume->hmp;
642                 if (++hmp->namekey_iterator == 0)
643                         ++hmp->namekey_iterator;
644                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
645                 record->rec.base.base.key |= hmp->namekey_iterator;
646                 cursor.key_beg.key = record->rec.base.base.key;
647         }
648         if (error != ENOENT)
649                 goto done;
650
651         /*
652          * Mark the record as undergoing synchronization.  Our cursor is
653          * holding a locked B-Tree node for the insertion which interlocks
654          * anyone trying to access this record.
655          *
656          * XXX There is still a race present related to iterations.  An
657          * iteration may process the record, a sync may occur, and then
658          * later process the B-Tree element for the same record.
659          *
660          * We do not try to synchronize a deleted record.
661          */
662         if (record->flags & HAMMER_RECF_DELETED) {
663                 error = 0;
664                 goto done;
665         }
666
667         /*
668          * Allocate record and data space now that we know which cluster
669          * the B-Tree node ended up in.
670          */
671         if (record->data == NULL ||
672             (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
673                 bdata = record->data;
674         } else {
675                 bdata = hammer_alloc_data(cursor.node->cluster,
676                                           record->rec.base.data_len, &error,
677                                           &cursor.data_buffer);
678                 if (bdata == NULL)
679                         goto done;
680         }
681         rec = hammer_alloc_record(cursor.node->cluster, &error,
682                                   record->rec.base.base.rec_type,
683                                   &cursor.record_buffer);
684         if (rec == NULL)
685                 goto fail1;
686
687         /*
688          * Fill everything in and insert our B-Tree node.
689          */
690         hammer_modify_buffer(cursor.record_buffer);
691         *rec = record->rec;
692         if (bdata) {
693                 rec->base.data_crc = crc32(record->data,
694                                            record->rec.base.data_len);
695                 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
696                         /*
697                          * Data embedded in record
698                          */
699                         rec->base.data_offset = ((char *)bdata -
700                                                  (char *)&record->rec);
701                         KKASSERT(rec->base.data_offset >= 0 && 
702                                  rec->base.data_offset + rec->base.data_len <=
703                                   sizeof(*rec));
704                         rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
705                 } else {
706                         /*
707                          * Data separate from record
708                          */
709                         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
710                         hammer_modify_buffer(cursor.data_buffer);
711                         bcopy(record->data, bdata, rec->base.data_len);
712                 }
713         }
714         rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);
715
716         elm.leaf.base = record->rec.base.base;
717         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
718         elm.leaf.data_offset = rec->base.data_offset;
719         elm.leaf.data_len = rec->base.data_len;
720         elm.leaf.data_crc = rec->base.data_crc;
721
722         error = hammer_btree_insert(&cursor, &elm);
723
724         /*
725          * Clean up on success, or fall through on error.
726          */
727         if (error == 0) {
728                 record->flags |= HAMMER_RECF_DELETED;
729                 goto done;
730         }
731
732         hammer_free_record_ptr(cursor.record_buffer, rec,
733                                record->rec.base.base.rec_type);
734 fail1:
735         if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
736                 hammer_free_data_ptr(cursor.data_buffer, bdata,
737                                      record->rec.base.data_len);
738         }
739 done:
740         record->flags &= ~HAMMER_RECF_SYNCING;
741         /*
742          * If ENOSPC in cluster fill in the spike structure and return
743          * ENOSPC.
744          */
745         if (error == ENOSPC)
746                 hammer_load_spike(&cursor, spike);
747         hammer_done_cursor(&cursor);
748         if (error == EDEADLK)
749                 goto retry;
750         return(error);
751 }
752
753 /*
754  * Write out a record using the specified cursor.  The caller does not have
755  * to seek the cursor.  The flags are used to determine whether the data
756  * (if any) is embedded in the record or not.
757  *
758  * The cursor must be initialized, including its key_beg element.  The B-Tree
759  * key may be different from the base element in the record (e.g. for spikes).
760  *
761  * NOTE: This can return EDEADLK, requiring the caller to release its cursor
762  * and retry the operation.
763  */
764 int
765 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
766                     void *data, int cursor_flags)
767 {
768         union hammer_btree_elm elm;
769         hammer_record_ondisk_t nrec;
770         hammer_cluster_t ncluster;
771         hammer_volume_t nvolume;
772         int32_t nrec_offset;
773         void *bdata;
774         int error;
775
776         KKASSERT(cursor->flags & HAMMER_CURSOR_INSERT);
777
778         /*
779          * Issue a lookup to position the cursor and locate the cluster.  The
780          * target key should not exist.
781          *
782          * If we run out of space trying to adjust the B-Tree for the
783          * insert, re-lookup without the insert flag so the cursor
784          * is properly positioned for the spike.
785          */
786         error = hammer_btree_lookup(cursor);
787         if (error == 0) {
788                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
789                         orec->base.base.key);
790                 Debugger("duplicate record2");
791                 error = EIO;
792         }
793         if (error != ENOENT)
794                 goto done;
795
796         /*
797          * Allocate record and data space now that we know which cluster
798          * the B-Tree node ended up in.
799          */
800         if (data == NULL ||
801             (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
802                 bdata = data;
803         } else {
804                 bdata = hammer_alloc_data(cursor->node->cluster,
805                                           orec->base.data_len, &error,
806                                           &cursor->data_buffer);
807                 if (bdata == NULL)
808                         goto done;
809         }
810         nrec = hammer_alloc_record(cursor->node->cluster, &error,
811                                   orec->base.base.rec_type,
812                                   &cursor->record_buffer);
813         if (nrec == NULL)
814                 goto fail1;
815
816         /*
817          * Fill everything in and insert our B-Tree node.
818          */
819         hammer_modify_buffer(cursor->record_buffer);
820         *nrec = *orec;
821         nrec->base.data_offset = 0;
822         if (bdata) {
823                 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
824                 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
825                         /*
826                          * Data embedded in record
827                          */
828                         nrec->base.data_offset = ((char *)bdata - (char *)orec);
829                         KKASSERT(nrec->base.data_offset >= 0 && 
830                                  nrec->base.data_offset + nrec->base.data_len <
831                                   sizeof(*nrec));
832                         nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
833                 } else {
834                         /*
835                          * Data separate from record
836                          */
837                         nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
838                         hammer_modify_buffer(cursor->data_buffer);
839                         bcopy(data, bdata, nrec->base.data_len);
840                 }
841         }
842         nrec->base.rec_id = hammer_alloc_recid(cursor->node->cluster);
843         nrec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
844
845         switch(cursor->key_beg.btype) {
846         case HAMMER_BTREE_TYPE_RECORD:
847                 elm.leaf.base = cursor->key_beg;
848                 elm.leaf.rec_offset = nrec_offset;
849                 elm.leaf.data_offset = nrec->base.data_offset;
850                 elm.leaf.data_len = nrec->base.data_len;
851                 elm.leaf.data_crc = nrec->base.data_crc;
852                 error = hammer_btree_insert(cursor, &elm);
853                 break;
854         case HAMMER_BTREE_TYPE_SPIKE_BEG:
855 #if 0
856                 Debugger("SpikeSpike");
857 #endif
858                 nvolume = hammer_get_volume(cursor->node->cluster->volume->hmp,
859                                             nrec->spike.vol_no, &error);
860                 if (error)
861                         break;
862                 ncluster = hammer_get_cluster(nvolume, nrec->spike.clu_no,
863                                               &error, GET_CLUSTER_NORECOVER);
864                 hammer_rel_volume(nvolume, 0);
865                 if (error)
866                         break;
867                 hammer_modify_cluster(ncluster);
868                 ncluster->ondisk->clu_btree_parent_offset =
869                         cursor->node->node_offset;
870                 hammer_rel_cluster(ncluster, 0);
871                 error = hammer_btree_insert_cluster(cursor, ncluster,
872                                                     nrec_offset);
873                 break;
874         case HAMMER_BTREE_TYPE_SPIKE_END:
875                 panic("hammer_write_record: cannot write spike-end elms");
876                 break;
877         default:
878                 panic("hammer_write_record: unknown btype %02x",
879                       elm.leaf.base.btype);
880                 break;
881         }
882         if (error == 0)
883                 goto done;
884
885         hammer_free_record_ptr(cursor->record_buffer, nrec,
886                                orec->base.base.rec_type);
887 fail1:
888         if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
889                 hammer_free_data_ptr(cursor->data_buffer, bdata,
890                                      orec->base.data_len);
891         }
892 done:
893         /* leave cursor intact */
894         return(error);
895 }
896
897 /*
898  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
899  * entry's key is used to deal with hash collisions in the upper 32 bits.
900  * A unique 64 bit key is generated in-memory and may be regenerated a
901  * second time when the directory record is flushed to the on-disk B-Tree.
902  *
903  * A referenced record is passed to this function.  This function
904  * eats the reference.  If an error occurs the record will be deleted.
905  */
906 static
907 int
908 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
909 {
910         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
911                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
912                         record->flags |= HAMMER_RECF_DELETED;
913                         hammer_rel_mem_record(record);
914                         return (EEXIST);
915                 }
916                 if (++trans->hmp->namekey_iterator == 0)
917                         ++trans->hmp->namekey_iterator;
918                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
919                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
920         }
921         record->flags |= HAMMER_RECF_ONRBTREE;
922         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
923         hammer_rel_mem_record(record);
924         return(0);
925 }
926
927 /************************************************************************
928  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
929  ************************************************************************
930  *
931  * These functions augment the B-Tree scanning functions in hammer_btree.c
932  * by merging in-memory records with on-disk records.
933  */
934
935 /*
936  * Locate a particular record either in-memory or on-disk.
937  *
938  * NOTE: This is basically a standalone routine, hammer_ip_next() may
939  * NOT be called to iterate results.
940  */
941 int
942 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
943 {
944         int error;
945
946         /*
947          * If the element is in-memory return it without searching the
948          * on-disk B-Tree
949          */
950         error = hammer_mem_lookup(cursor, ip);
951         if (error == 0) {
952                 cursor->record = &cursor->iprec->rec;
953                 return(error);
954         }
955         if (error != ENOENT)
956                 return(error);
957
958         /*
959          * If the inode has on-disk components search the on-disk B-Tree.
960          */
961         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
962                 return(error);
963         error = hammer_btree_lookup(cursor);
964         if (error == 0)
965                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
966         return(error);
967 }
968
969 /*
970  * Locate the first record within the cursor's key_beg/key_end range,
971  * restricted to a particular inode.  0 is returned on success, ENOENT
972  * if no records matched the requested range, or some other error.
973  *
974  * When 0 is returned hammer_ip_next() may be used to iterate additional
975  * records within the requested range.
976  *
977  * This function can return EDEADLK, requiring the caller to terminate
978  * the cursor and try again.
979  */
980 int
981 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
982 {
983         int error;
984
985         /*
986          * Clean up fields and setup for merged scan
987          */
988         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
989         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
990         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
991         if (cursor->iprec) {
992                 hammer_rel_mem_record(cursor->iprec);
993                 cursor->iprec = NULL;
994         }
995
996         /*
997          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
998          * exact lookup so if we get ENOENT we have to call the iterate
999          * function to validate the first record after the begin key.
1000          *
1001          * The ATEDISK flag is used by hammer_btree_iterate to determine
1002          * whether it must index forwards or not.  It is also used here
1003          * to select the next record from in-memory or on-disk.
1004          *
1005          * EDEADLK can only occur if the lookup hit an empty internal
1006          * element and couldn't delete it.  Since this could only occur
1007          * in-range, we can just iterate from the failure point.
1008          */
1009         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1010                 error = hammer_btree_lookup(cursor);
1011                 if (error == ENOENT || error == EDEADLK) {
1012                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1013                         error = hammer_btree_iterate(cursor);
1014                 }
1015                 if (error && error != ENOENT) 
1016                         return(error);
1017                 if (error == 0) {
1018                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1019                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1020                 } else {
1021                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1022                 }
1023         }
1024
1025         /*
1026          * Search the in-memory record list (Red-Black tree).  Unlike the
1027          * B-Tree search, mem_first checks for records in the range.
1028          */
1029         error = hammer_mem_first(cursor, ip);
1030         if (error && error != ENOENT)
1031                 return(error);
1032         if (error == 0) {
1033                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1034                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1035         }
1036
1037         /*
1038          * This will return the first matching record.
1039          */
1040         return(hammer_ip_next(cursor));
1041 }
1042
1043 /*
1044  * Retrieve the next record in a merged iteration within the bounds of the
1045  * cursor.  This call may be made multiple times after the cursor has been
1046  * initially searched with hammer_ip_first().
1047  *
1048  * 0 is returned on success, ENOENT if no further records match the
1049  * requested range, or some other error code is returned.
1050  */
1051 int
1052 hammer_ip_next(hammer_cursor_t cursor)
1053 {
1054         hammer_btree_elm_t elm;
1055         hammer_record_t rec;
1056         int error;
1057         int r;
1058
1059         /*
1060          * Load the current on-disk and in-memory record.  If we ate any
1061          * records we have to get the next one. 
1062          *
1063          * If we deleted the last on-disk record we had scanned ATEDISK will
1064          * be clear and DELBTREE will be set, forcing a call to iterate. The
1065          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1066          * element.  If ATEDISK is set, iterate will skip the 'current'
1067          * element.
1068          *
1069          * Get the next on-disk record
1070          */
1071         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1072                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1073                         error = hammer_btree_iterate(cursor);
1074                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1075                         if (error == 0)
1076                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1077                         else
1078                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1079                                                  HAMMER_CURSOR_ATEDISK;
1080                 }
1081         }
1082
1083         /*
1084          * Get the next in-memory record.  The record can be ripped out
1085          * of the RB tree so we maintain a scan_info structure to track
1086          * the next node.
1087          *
1088          * hammer_rec_scan_cmp:  Is the record still in our general range,
1089          *                       (non-inclusive of snapshot exclusions)?
1090          * hammer_rec_scan_callback: Is the record in our snapshot?
1091          */
1092         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1093                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1094                         if (cursor->iprec) {
1095                                 hammer_rel_mem_record(cursor->iprec);
1096                                 cursor->iprec = NULL;
1097                         }
1098                         rec = cursor->scan.node;        /* next node */
1099                         while (rec) {
1100                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1101                                         break;
1102                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1103                                         break;
1104                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1105                         }
1106                         if (cursor->iprec) {
1107                                 KKASSERT(cursor->iprec == rec);
1108                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1109                                 cursor->scan.node =
1110                                         hammer_rec_rb_tree_RB_NEXT(rec);
1111                         } else {
1112                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1113                         }
1114                 }
1115         }
1116
1117         /*
1118          * Extract either the disk or memory record depending on their
1119          * relative position.
1120          */
1121         error = 0;
1122         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1123         case 0:
1124                 /*
1125                  * Both entries valid
1126                  */
1127                 elm = &cursor->node->ondisk->elms[cursor->index];
1128                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1129                 if (r < 0) {
1130                         error = hammer_btree_extract(cursor,
1131                                                      HAMMER_CURSOR_GET_RECORD);
1132                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1133                         break;
1134                 }
1135                 /* fall through to the memory entry */
1136         case HAMMER_CURSOR_ATEDISK:
1137                 /*
1138                  * Only the memory entry is valid
1139                  */
1140                 cursor->record = &cursor->iprec->rec;
1141                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1142                 break;
1143         case HAMMER_CURSOR_ATEMEM:
1144                 /*
1145                  * Only the disk entry is valid
1146                  */
1147                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1148                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1149                 break;
1150         default:
1151                 /*
1152                  * Neither entry is valid
1153                  *
1154                  * XXX error not set properly
1155                  */
1156                 cursor->record = NULL;
1157                 error = ENOENT;
1158                 break;
1159         }
1160         return(error);
1161 }
1162
1163 /*
1164  * Resolve the cursor->data pointer for the current cursor position in
1165  * a merged iteration.
1166  */
1167 int
1168 hammer_ip_resolve_data(hammer_cursor_t cursor)
1169 {
1170         int error;
1171
1172         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1173                 cursor->data = cursor->iprec->data;
1174                 error = 0;
1175         } else {
1176                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1177         }
1178         return(error);
1179 }
1180
1181 /*
1182  * Delete all records within the specified range for inode ip.
1183  *
1184  * NOTE: An unaligned range will cause new records to be added to cover
1185  * the edge cases. (XXX not implemented yet).
1186  *
1187  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1188  *
1189  * NOTE: Record keys for regular file data have to be special-cased since
1190  * they indicate the end of the range (key = base + bytes).
1191  *
1192  * NOTE: The spike structure must be filled in if we return ENOSPC.
1193  */
1194 int
1195 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1196                        int64_t ran_beg, int64_t ran_end,
1197                        struct hammer_cursor **spike)
1198 {
1199         struct hammer_cursor cursor;
1200         hammer_record_ondisk_t rec;
1201         hammer_base_elm_t base;
1202         int error;
1203         int64_t off;
1204
1205 retry:
1206         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1207
1208         cursor.key_beg.obj_id = ip->obj_id;
1209         cursor.key_beg.create_tid = 0;
1210         cursor.key_beg.delete_tid = 0;
1211         cursor.key_beg.obj_type = 0;
1212         cursor.asof = ip->obj_asof;
1213         cursor.flags |= HAMMER_CURSOR_ASOF;
1214
1215         cursor.key_end = cursor.key_beg;
1216         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1217                 cursor.key_beg.key = ran_beg;
1218                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1219                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1220                 cursor.key_end.key = ran_end;
1221         } else {
1222                 /*
1223                  * The key in the B-Tree is (base+bytes), so the first possible
1224                  * matching key is ran_beg + 1.
1225                  */
1226                 int64_t tmp64;
1227
1228                 cursor.key_beg.key = ran_beg + 1;
1229                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1230                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1231
1232                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1233                 if (tmp64 < ran_end)
1234                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1235                 else
1236                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1237         }
1238         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1239
1240         error = hammer_ip_first(&cursor, ip);
1241
1242         /*
1243          * Iterate through matching records and mark them as deleted.
1244          */
1245         while (error == 0) {
1246                 rec = cursor.record;
1247                 base = &rec->base.base;
1248
1249                 KKASSERT(base->delete_tid == 0);
1250
1251                 /*
1252                  * There may be overlap cases for regular file data.  Also
1253                  * remember the key for a regular file record is the offset
1254                  * of the last byte of the record (base + len - 1), NOT the
1255                  * base offset.
1256                  */
1257 #if 0
1258                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1259 #endif
1260                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1261 #if 0
1262                         kprintf("delete_range loop key %016llx\n",
1263                                 base->key - rec->base.data_len);
1264 #endif
1265                         off = base->key - rec->base.data_len;
1266                         /*
1267                          * Check the left edge case.  We currently do not
1268                          * split existing records.
1269                          */
1270                         if (off < ran_beg) {
1271                                 panic("hammer left edge case %016llx %d\n",
1272                                         base->key, rec->base.data_len);
1273                         }
1274
1275                         /*
1276                          * Check the right edge case.  Note that the
1277                          * record can be completely out of bounds, which
1278                          * terminates the search.
1279                          *
1280                          * base->key is exclusive of the right edge while
1281                          * ran_end is inclusive of the right edge.  The
1282                          * (key - data_len) left boundary is inclusive.
1283                          *
1284                          * XXX theory-check this test at some point, are
1285                          * we missing a + 1 somewhere?  Note that ran_end
1286                          * could overflow.
1287                          */
1288                         if (base->key - 1 > ran_end) {
1289                                 if (base->key - rec->base.data_len > ran_end)
1290                                         break;
1291                                 panic("hammer right edge case\n");
1292                         }
1293                 }
1294
1295                 /*
1296                  * Mark the record and B-Tree entry as deleted.  This will
1297                  * also physically delete the B-Tree entry, record, and
1298                  * data if the retention policy dictates.  The function
1299                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1300                  * uses to perform a fixup.
1301                  */
1302                 error = hammer_ip_delete_record(&cursor, trans->tid);
1303                 if (error)
1304                         break;
1305                 error = hammer_ip_next(&cursor);
1306         }
1307         hammer_done_cursor(&cursor);
1308         if (error == EDEADLK)
1309                 goto retry;
1310         if (error == ENOENT)
1311                 error = 0;
1312         return(error);
1313 }
1314
1315 /*
1316  * Delete all records associated with an inode except the inode record
1317  * itself.
1318  */
1319 int
1320 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1321 {
1322         struct hammer_cursor cursor;
1323         hammer_record_ondisk_t rec;
1324         hammer_base_elm_t base;
1325         int error;
1326
1327 retry:
1328         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1329
1330         cursor.key_beg.obj_id = ip->obj_id;
1331         cursor.key_beg.create_tid = 0;
1332         cursor.key_beg.delete_tid = 0;
1333         cursor.key_beg.obj_type = 0;
1334         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1335         cursor.key_beg.key = HAMMER_MIN_KEY;
1336
1337         cursor.key_end = cursor.key_beg;
1338         cursor.key_end.rec_type = 0xFFFF;
1339         cursor.key_end.key = HAMMER_MAX_KEY;
1340
1341         cursor.asof = ip->obj_asof;
1342         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1343
1344         error = hammer_ip_first(&cursor, ip);
1345
1346         /*
1347          * Iterate through matching records and mark them as deleted.
1348          */
1349         while (error == 0) {
1350                 rec = cursor.record;
1351                 base = &rec->base.base;
1352
1353                 KKASSERT(base->delete_tid == 0);
1354
1355                 /*
1356                  * Mark the record and B-Tree entry as deleted.  This will
1357                  * also physically delete the B-Tree entry, record, and
1358                  * data if the retention policy dictates.  The function
1359                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1360                  * uses to perform a fixup.
1361                  */
1362                 error = hammer_ip_delete_record(&cursor, trans->tid);
1363                 if (error)
1364                         break;
1365                 error = hammer_ip_next(&cursor);
1366         }
1367         hammer_done_cursor(&cursor);
1368         if (error == EDEADLK)
1369                 goto retry;
1370         if (error == ENOENT)
1371                 error = 0;
1372         return(error);
1373 }
1374
1375 /*
1376  * Delete the record at the current cursor.  On success the cursor will
1377  * be positioned appropriately for an iteration but may no longer be at
1378  * a leaf node.
1379  *
1380  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1381  * cursor and retry.
1382  */
1383 int
1384 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1385 {
1386         hammer_btree_elm_t elm;
1387         hammer_mount_t hmp;
1388         int error;
1389         int dodelete;
1390
1391         /*
1392          * In-memory (unsynchronized) records can simply be freed.
1393          */
1394         if (cursor->record == &cursor->iprec->rec) {
1395                 cursor->iprec->flags |= HAMMER_RECF_DELETED;
1396                 return(0);
1397         }
1398
1399         /*
1400          * On-disk records are marked as deleted by updating their delete_tid.
1401          * This does not effect their position in the B-Tree (which is based
1402          * on their create_tid).
1403          */
1404         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1405         elm = NULL;
1406         hmp = cursor->node->cluster->volume->hmp;
1407
1408         dodelete = 0;
1409         if (error == 0) {
1410                 error = hammer_cursor_upgrade(cursor);
1411                 if (error == 0) {
1412                         hammer_modify_node(cursor->node);
1413                         elm = &cursor->node->ondisk->elms[cursor->index];
1414                         elm->leaf.base.delete_tid = tid;
1415                         hammer_modify_buffer(cursor->record_buffer);
1416                         cursor->record->base.base.delete_tid = tid;
1417                 }
1418         }
1419
1420         /*
1421          * If we were mounted with the nohistory option, we physically
1422          * delete the record.
1423          */
1424         if (hmp->hflags & HMNT_NOHISTORY)
1425                 dodelete = 1;
1426
1427         if (error == 0 && dodelete) {
1428                 error = hammer_delete_at_cursor(cursor, NULL);
1429                 if (error) {
1430                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1431                         error = 0;
1432                 }
1433         }
1434         return(error);
1435 }
1436
1437 int
1438 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1439 {
1440         hammer_btree_elm_t elm;
1441         int32_t rec_offset;
1442         int32_t data_offset;
1443         int32_t data_len;
1444         u_int8_t rec_type;
1445         hammer_cluster_t cluster;
1446         int error;
1447
1448         elm = &cursor->node->ondisk->elms[cursor->index];
1449         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1450
1451         rec_offset = elm->leaf.rec_offset;
1452         data_offset = elm->leaf.data_offset;
1453         data_len = elm->leaf.data_len;
1454         rec_type = elm->leaf.base.rec_type;
1455
1456         /*
1457          * We must ref the cluster to prevent it from being
1458          * freed prior to our freeing the last record.
1459          */
1460         cluster = cursor->node->cluster;
1461         hammer_ref_cluster(cluster);
1462
1463         error = hammer_btree_delete(cursor);
1464         if (error == 0) {
1465                 /*
1466                  * This forces a fixup for the iteration because
1467                  * the cursor is now either sitting at the 'next'
1468                  * element or sitting at the end of a leaf.
1469                  */
1470                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1471                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1472                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1473                 }
1474                 hammer_free_record(cluster, rec_offset, rec_type);
1475                 if (data_offset && (data_offset - rec_offset < 0 ||
1476                     data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1477                         hammer_free_data(cluster, data_offset,data_len);
1478                         if (stat_bytes)
1479                                 *stat_bytes += data_len;
1480                 }
1481         }
1482 #if 0
1483         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1484                 "(%d remain in cluster)\n",
1485                 cluster->volume->vol_no, cluster->clu_no,
1486                 rec_offset, data_offset, data_len,
1487                 cluster->ondisk->stat_records);
1488 #endif
1489         hammer_rel_cluster(cluster, 0);
1490         return (error);
1491 }
1492
1493 /*
1494  * Determine whether a directory is empty or not.  Returns 0 if the directory
1495  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1496  */
1497 int
1498 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1499 {
1500         struct hammer_cursor cursor;
1501         int error;
1502
1503         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1504
1505         cursor.key_beg.obj_id = ip->obj_id;
1506         cursor.key_beg.create_tid = 0;
1507         cursor.key_beg.delete_tid = 0;
1508         cursor.key_beg.obj_type = 0;
1509         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1510         cursor.key_beg.key = HAMMER_MIN_KEY;
1511
1512         cursor.key_end = cursor.key_beg;
1513         cursor.key_end.rec_type = 0xFFFF;
1514         cursor.key_end.key = HAMMER_MAX_KEY;
1515
1516         cursor.asof = ip->obj_asof;
1517         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1518
1519         error = hammer_ip_first(&cursor, ip);
1520         if (error == ENOENT)
1521                 error = 0;
1522         else if (error == 0)
1523                 error = ENOTEMPTY;
1524         hammer_done_cursor(&cursor);
1525         return(error);
1526 }
1527