/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.47 2008/04/27 21:07:15 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);

/*
 * Red-black tree support.  Records are sorted by rec_type, then key,
 * then create_tid.  A create_tid of zero sorts after all explicit
 * transaction ids.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
        if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
                return(-1);
        if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
                return(1);

        if (rec1->rec.base.base.key < rec2->rec.base.base.key)
                return(-1);
        if (rec1->rec.base.base.key > rec2->rec.base.base.key)
                return(1);

        if (rec1->rec.base.base.create_tid == 0) {
                if (rec2->rec.base.base.create_tid == 0)
                        return(0);
                return(1);
        }
        if (rec2->rec.base.base.create_tid == 0)
                return(-1);

        if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
                return(-1);
        if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
                return(1);

        /*
         * Never match against an item deleted by the front-end.
         */
        if (rec1->flags & HAMMER_RECF_DELETED_FE)
                return(1);
        if (rec2->flags & HAMMER_RECF_DELETED_FE)
                return(-1);

        return(0);
}

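/*
 * Basic record compare.  The magnitude of a non-zero return value
 * indicates which field failed to match: +/-3 for the record type,
 * +/-2 for the key, and +/-1 when only the create_tid differs.  The
 * range-scan helpers below rely on this: a create_tid-only mismatch
 * does not terminate a scan.
 */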
static int
hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
{
        if (info->rec_type < rec->rec.base.base.rec_type)
                return(-3);
        if (info->rec_type > rec->rec.base.base.rec_type)
                return(3);

        if (info->key < rec->rec.base.base.key)
                return(-2);
        if (info->key > rec->rec.base.base.key)
                return(2);

        if (info->create_tid == 0) {
                if (rec->rec.base.base.create_tid == 0)
                        return(0);
                return(1);
        }
        if (rec->rec.base.base.create_tid == 0)
                return(-1);
        if (info->create_tid < rec->rec.base.base.create_tid)
                return(-1);
        if (info->create_tid > rec->rec.base.base.create_tid)
                return(1);
        return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
 * These do not stop the scan.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_compare(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        r = hammer_rec_compare(&cursor->key_end, rec);
        if (r < -1)
                return(1);
        return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_compare(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        if (r < -1)
                return(1);
        return(0);
}

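/*
 * Instantiate the red-black tree support functions.  The XLOOKUP
 * variant generates lookup code keyed on a hammer_base_elm_t (via
 * hammer_rec_compare()) rather than on another record.
 */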
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_compare, hammer_base_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
        hammer_record_t record;

        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
        record->state = HAMMER_FST_IDLE;
        record->ip = ip;
        record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        TAILQ_INIT(&record->depend_list);
        hammer_ref(&record->lock);
        return (record);
}

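/*
 * Wait for a record to leave the flushing state.  We set
 * HAMMER_RECF_WANTED and sleep; hammer_flush_record_done() issues the
 * wakeup when the backend finishes with the record.
 */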
void
hammer_wait_mem_record(hammer_record_t record)
{
        while (record->state == HAMMER_FST_FLUSH) {
                record->flags |= HAMMER_RECF_WANTED;
                tsleep(record, 0, "hmrrc2", 0);
        }
}

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
        KKASSERT(record->state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        if (error) {
                /*
                 * An error occurred, the backend was unable to sync the
                 * record to its media.  Leave the record intact.
                 */
        } else if (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK) {
                /*
                 * Deleted-record to delete-on-disk conversion.  This occurs
                 * when we sync a record to disk which is marked deleted by
                 * the frontend, but not deleted from the point of view of
                 * the backend.
                 */
                if (record->flags & HAMMER_RECF_DELETED_BE) {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        hammer_cleardep_mem_record(record);
                } else {
                        KKASSERT(record->flags & HAMMER_RECF_DELETE_ONDISK);
                }
        } else {
                /*
                 * Normal completion, the record has been disposed of (by
                 * having been synchronized to the media).
                 */
                record->flags |= HAMMER_RECF_DELETED_FE;
                hammer_cleardep_mem_record(record);
        }
        record->state = HAMMER_FST_IDLE;
        record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
        record->flags &= ~HAMMER_RECF_CONVERT_DELETE_ONDISK;
        hammer_unlock(&record->lock);
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
        }
        hammer_rel_mem_record(record);
}

/*
 * Clear dependencies associated with a memory record.  Each dependency
 * holds a reference on both the record and the related inode; both
 * references are dropped here.  The inode itself is not flushed.
 */
void
hammer_cleardep_mem_record(struct hammer_record *record)
{
        hammer_depend_t depend;

        while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
                TAILQ_REMOVE(&record->depend_list, depend,
                             rec_entry);
                TAILQ_REMOVE(&depend->ip->depend_list, depend,
                             ip_entry);
                --depend->ip->depend_count;
                /* NOTE: inode is not flushed */
                hammer_rel_inode(depend->ip, 0);
                hammer_unref(&record->lock);
                KKASSERT(record->lock.refs > 0);
                kfree(depend, M_HAMMER);
        }
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
        hammer_unref(&record->lock);

        if (record->flags & HAMMER_RECF_DELETED_FE) {
                if (record->lock.refs == 0) {
                        KKASSERT(record->state == HAMMER_FST_IDLE);
                        KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);

                        if (record->flags & HAMMER_RECF_ONRBTREE) {
                                RB_REMOVE(hammer_rec_rb_tree,
                                          &record->ip->rec_tree,
                                          record);
                                record->flags &= ~HAMMER_RECF_ONRBTREE;
                        }
                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
                        record->data = NULL;
                        --hammer_count_records;
                        kfree(record, M_HAMMER);
                        return;
                }
        }
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.  The backend can only see records which
 * are undergoing synchronization (interlocked with BE) and which have
 * not been marked deleted on the backend side; the frontend sees
 * everything not marked deleted on the frontend side.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
        if (cursor->flags & HAMMER_CURSOR_BACKEND) {
                if (record->flags & HAMMER_RECF_DELETED_BE)
                        return(0);
                if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
                        return(0);
        } else {
                if (record->flags & HAMMER_RECF_DELETED_FE)
                        return(0);
        }
        return(1);
}

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;

        /*
         * We terminate on success, so this should be NULL on entry.
         */
        KKASSERT(cursor->iprec == NULL);

        /*
         * Skip if the record was marked deleted.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
                return(0);

        /*
         * Skip if not visible due to our as-of TID
         */
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                if (cursor->asof < rec->rec.base.base.create_tid)
                        return(0);
                if (rec->rec.base.base.delete_tid &&
                    cursor->asof >= rec->rec.base.base.delete_tid) {
                        return(0);
                }
        }

        /*
         * If the record is queued to the flusher we have to block until
         * it isn't.  Otherwise we may see duplication between our memory
         * cache and the media.
         */
        hammer_ref(&rec->lock);

#warning "This deadlocks"
#if 0
        if (rec->state == HAMMER_FST_FLUSH)
                hammer_wait_mem_record(rec);
#endif

        /*
         * The record may have been deleted while we were blocked.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
                hammer_rel_mem_record(rec);
                return(0);
        }

        /*
         * Set the matching record and stop the scan.
         */
        cursor->iprec = rec;
        return(-1);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
        int error;

        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                KKASSERT(cursor->ip->cursor_ip_refs > 0);
                --cursor->ip->cursor_ip_refs;
#if 0
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
#endif
        }
        cursor->ip = ip;
#if 0
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
        ++ip->cursor_ip_refs;

#if 0
        cursor->scan.node = NULL;
#endif
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
                                   hammer_rec_scan_callback, cursor);

        if (cursor->iprec == NULL)
                error = ENOENT;
        else
                error = 0;
        return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                KKASSERT(cursor->ip->cursor_ip_refs > 0);
                --cursor->ip->cursor_ip_refs;
#if 0
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
#endif
        }
        cursor->ip = ip;
#if 0
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
        ++ip->cursor_ip_refs;

#if 0
        cursor->scan.node = NULL;
#endif
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         */
        if (cursor->iprec) {
#if 0
                cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
#endif
                return(0);
        }
        return(ENOENT);
}

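/*
 * Finish using the cursor for in-memory record scans: drop the inode's
 * cursor reference and release any record still held by the cursor.
 */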
void
hammer_mem_done(hammer_cursor_t cursor)
{
        if (cursor->ip) {
                KKASSERT(cursor->ip->cursor_ip_refs > 0);
                --cursor->ip->cursor_ip_refs;
#if 0
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
#endif
                cursor->ip = NULL;
        }
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
}

/************************************************************************
 *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when syncing to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *dip, struct namecache *ncp,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        hammer_depend_t depend;
        int error;
        int bytes;

        record = hammer_alloc_mem_record(dip);
        depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);

        bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        record->rec.entry.base.base.obj_id = dip->obj_id;
        record->rec.entry.base.base.key =
                hammer_directory_namekey(ncp->nc_name, bytes);
        record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
        record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
        record->rec.entry.obj_id = ip->obj_id;
        record->data = (void *)ncp->nc_name;
        record->rec.entry.base.data_len = bytes;
        ++ip->ino_rec.ino_nlinks;
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        /* NOTE: copies record->data */

        /*
         * If the inode gets synced, cause the directory entry to be
         * synced as well, and vice versa.
         */
        hammer_ref(&record->lock);      /* for depend entry */
        hammer_ref(&ip->lock);          /* for depend entry */
        depend->ip = ip;
        depend->record = record;
        TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
        TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
        ++ip->depend_count;

        error = hammer_mem_add(trans, record);
        return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be positioned at the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                     hammer_cursor_t cursor, struct hammer_inode *dip,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        hammer_depend_t depend;
        int error;

        if (cursor->record == &cursor->iprec->rec) {
                /*
                 * In-memory (unsynchronized) records can simply be freed.
                 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
                 * by the backend, we must still avoid races against the
                 * backend potentially syncing the record to the media.
                 *
                 * We cannot call hammer_ip_delete_record(), that routine may
                 * only be called from the backend.
                 */
                record = cursor->iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        KKASSERT(cursor->deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor->deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        hammer_cleardep_mem_record(record);
                        error = 0;
                }
        } else {
                /*
                 * If the record is on-disk we have to queue the deletion by
                 * the record's key.  This also causes lookups to skip the
                 * record.
                 */
                depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);

                record = hammer_alloc_mem_record(dip);
                record->rec.entry.base.base = cursor->record->base.base;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                record->flags |= HAMMER_RECF_DELETE_ONDISK;

                /*
                 * If the inode gets synced, cause the directory entry to
                 * be synced as well, and vice versa.
                 */
                hammer_ref(&ip->lock);          /* for depend entry */
                hammer_ref(&record->lock);      /* for depend entry */
                depend->ip = ip;
                depend->record = record;
                TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
                TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
                ++ip->depend_count;

                error = hammer_mem_add(trans, record);
        }

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away so we only try to sync if the OS has
         * no references and nlinks falls to 0.
         *
         * We have to terminate the cursor before syncing the inode to
         * avoid deadlocking against ourselves.
         *
         * XXX we can't sync the inode here because the encompassing
         * transaction might be a rename and might update the inode
         * again with a new link.  That would force the delete_tid to be
         * the same as the create_tid and cause a panic.
         */
        if (error == 0) {
                --ip->ino_rec.ino_nlinks;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                if (ip->ino_rec.ino_nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_done_cursor(cursor);
                }
        }
        return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data         (a copy will be kmalloc'd if it cannot be embedded)
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;

        record->rec.base.base.obj_id = ip->obj_id;
        record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;

        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
}
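
/*
 * Example usage (sketch, illustrative only):
 *
 *      record = hammer_alloc_mem_record(ip);
 *      record->rec.base.base.key = key;
 *      record->rec.base.base.rec_type = rec_type;
 *      record->rec.base.data_len = bytes;
 *      record->data = data;            (temporary, copied on insertion)
 *      error = hammer_ip_add_record(trans, record);
 */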

/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy code from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 *
 * The file offset must be HAMMER_BUFSIZE aligned but the data length
 * can be truncated.  The record (currently) always represents a BUFSIZE
 * swath of space whether the data is truncated or not.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
                       int64_t offset, void *data, int bytes)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        hammer_off_t rec_offset;
        void *bdata;
        int error;

        KKASSERT((offset & HAMMER_BUFMASK) == 0);
        KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
        error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
        if (error)
                return(error);
        cursor.key_beg.obj_id = ip->obj_id;
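        /* NOTE: a data record's key is the END of its range (offset + bytes) */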
        cursor.key_beg.key = offset + bytes;
        cursor.key_beg.create_tid = trans->tid;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
        cursor.asof = trans->tid;
        cursor.flags |= HAMMER_CURSOR_INSERT;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * Issue a lookup to position the cursor.
         */
        error = hammer_btree_lookup(&cursor);
        if (error == 0) {
                kprintf("hammer_ip_sync_data: duplicate data at "
                        "(%lld,%d) tid %016llx\n",
                        offset, bytes, trans->tid);
                hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
                                       HAMMER_BTREE_TYPE_LEAF, cursor.index);
                panic("Duplicate data");
                error = EIO;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate record and data space.  HAMMER_RECTYPE_DATA records
         * can cross buffer boundaries so we may have to split our bcopy.
         */
        rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
                                  &cursor.record_buffer,
                                  bytes, &bdata,
                                  &cursor.data_buffer, &error);
        if (rec == NULL)
                goto done;
        if (hammer_debug_general & 0x1000)
                kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);

        /*
         * Fill everything in and insert our B-Tree node.
         *
         * NOTE: hammer_alloc_record() has already marked the related
         * buffers as modified.  If we do it again we will generate
         * unnecessary undo elements.
         */
        hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
        rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        rec->base.base.obj_id = ip->obj_id;
        rec->base.base.key = offset + bytes;
        rec->base.base.create_tid = trans->tid;
        rec->base.base.delete_tid = 0;
        rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
        rec->base.data_crc = crc32(data, bytes);
        hammer_modify_buffer_done(cursor.record_buffer);
        KKASSERT(rec->base.data_len == bytes);

        hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
        bcopy(data, bdata, bytes);
        hammer_modify_buffer_done(cursor.data_buffer);

        elm.leaf.base = rec->base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = rec->base.data_off;
        elm.leaf.data_len = bytes;
        elm.leaf.data_crc = rec->base.data_crc;

        /*
         * Data records can wind up on-disk before the inode itself is
         * on-disk.  One must assume data records may be on-disk if either
         * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
         */
        ip->flags |= HAMMER_INODE_DONDISK;

        error = hammer_btree_insert(&cursor, &elm);
        if (error == 0)
                goto done;

        hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
done:
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        return(error);
}

/*
 * Sync an in-memory record to the disk.  This is called by the backend.
 * This code is responsible for actually writing a record out to the disk.
 *
 * Any inode dependencies will queue the inode to the backend.
 *
 * This routine can only be called by the backend and the record
 * must have been interlocked with BE.  It will remain interlocked on
 * return.  The caller is responsible for the record's disposition.
 */
int
hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        hammer_depend_t depend;
        hammer_off_t rec_offset;
        void *bdata;
        int error;

        KKASSERT(record->state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        /*
         * XXX A record with a dependency is typically a directory record.
         * The related inode must also be synchronized.  This code is not
         * currently synchronizing the inode atomically. XXX
         *
         * XXX Additional dependencies from the frontend might be added while
         * the backend is syncing the record?
         */
        while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
                TAILQ_REMOVE(&record->depend_list, depend, rec_entry);
                TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry);
                --depend->ip->depend_count;
                kprintf("S");
                KKASSERT((depend->ip->flags & HAMMER_INODE_NEW) == 0);
                hammer_flush_inode(depend->ip, 0);
                hammer_rel_inode(depend->ip, 0);
                hammer_unref(&record->lock);
                KKASSERT(record->lock.refs > 0);
                kfree(depend, M_HAMMER);
        }

retry:
        /*
         * Get a cursor, we will either be inserting or deleting.
         */
        error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
        if (error)
                return(error);
        cursor.key_beg = record->rec.base.base;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * If we are deleting, an exact match must be found on-disk.
         */
        if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
                error = hammer_btree_lookup(&cursor);
                if (error == 0)
                        error = hammer_ip_delete_record(&cursor, trans->tid);
                goto done;
        }

        /*
         * We are inserting.
         *
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
        cursor.flags |= HAMMER_CURSOR_INSERT;

        for (;;) {
                error = hammer_btree_lookup(&cursor);
                if (error)
                        break;
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
                        kprintf("hammer_ip_sync_record: duplicate rec "
                                "at (%016llx)\n", record->rec.base.base.key);
                        Debugger("duplicate record1");
                        error = EIO;
                        break;
                }
                if (++trans->hmp->namekey_iterator == 0)
                        ++trans->hmp->namekey_iterator;
                record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                record->rec.base.base.key |= trans->hmp->namekey_iterator;
                cursor.key_beg.key = record->rec.base.base.key;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate the record and data.  The result buffers will be
         * marked as being modified and further calls to
         * hammer_modify_buffer() will result in unneeded UNDO records.
         *
         * Support zero-fill records (data == NULL and data_len != 0)
         */
        if (record->data == NULL) {
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor.record_buffer,
                                          0, &bdata,
                                          NULL, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("NULL RECORD DATA\n");
        } else if (record->flags & HAMMER_RECF_INBAND) {
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor.record_buffer,
                                          record->rec.base.data_len, &bdata,
                                          NULL, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
        } else {
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor.record_buffer,
                                          record->rec.base.data_len, &bdata,
                                          &cursor.data_buffer, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
        }

        if (rec == NULL)
                goto done;

        /*
         * Fill in the remaining fields and insert our B-Tree node.
         */
        hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
        rec->base.base = record->rec.base.base;
        bcopy(&record->rec.base + 1, &rec->base + 1,
              HAMMER_RECORD_SIZE - sizeof(record->rec.base));

        /*
         * Copy the data and deal with zero-fill support.
         */
        if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
                rec->base.data_crc = crc32(record->data, rec->base.data_len);
                bcopy(record->data, bdata, rec->base.data_len);
        } else if (record->data) {
                rec->base.data_crc = crc32(record->data, rec->base.data_len);
                hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
                bcopy(record->data, bdata, rec->base.data_len);
                hammer_modify_buffer_done(cursor.data_buffer);
        } else {
                rec->base.data_len = record->rec.base.data_len;
        }
        hammer_modify_buffer_done(cursor.record_buffer);

        elm.leaf.base = record->rec.base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = rec->base.data_off;
        elm.leaf.data_len = rec->base.data_len;
        elm.leaf.data_crc = rec->base.data_crc;

        error = hammer_btree_insert(&cursor, &elm);

        /*
         * This occurs when the frontend creates a record and queues it to
         * the backend, then tries to delete the record.  The backend must
         * still sync the record to the media as if it were not deleted,
         * but must interlock with the frontend to ensure that the
         * synchronized record is not visible to the frontend, which means
         * converting the 'deleted' record to a delete-on-disk record.
         */
        if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE_ONDISK)) {
                KKASSERT((record->flags & HAMMER_RECF_DELETE_ONDISK) == 0);
                record->flags |= HAMMER_RECF_DELETE_ONDISK;
                record->flags &= ~HAMMER_RECF_DELETED_FE;
        }

        /*
         * If an error occurred, unwind the operation.
         */
        if (error)
                hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);

done:
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
        void *data;
        int bytes;
        int reclen;

        /*
         * Make a private copy of record->data
         */
        if (record->data) {
                /*
                 * Try to embed the data in extra space in the record
                 * union, otherwise allocate a copy.
                 */
                bytes = record->rec.base.data_len;
                switch(record->rec.base.base.rec_type) {
                case HAMMER_RECTYPE_DIRENTRY:
                        reclen = offsetof(struct hammer_entry_record, name[0]);
                        break;
                case HAMMER_RECTYPE_DATA:
                        reclen = offsetof(struct hammer_data_record, data[0]);
                        break;
                default:
                        reclen = sizeof(record->rec);
                        break;
                }
                if (reclen + bytes <= HAMMER_RECORD_SIZE) {
                        bcopy(record->data, (char *)&record->rec + reclen,
                              bytes);
                        record->data = (void *)((char *)&record->rec + reclen);
                        record->flags |= HAMMER_RECF_INBAND;
                } else {
                        ++hammer_count_record_datas;
                        data = kmalloc(bytes, M_HAMMER, M_WAITOK);
                        record->flags |= HAMMER_RECF_ALLOCDATA;
                        bcopy(record->data, data, bytes);
                        record->data = data;
                }
        }

        /*
         * Insert into the RB tree, find an unused iterator if this is
         * a directory entry.
         */
        while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        KKASSERT(TAILQ_FIRST(&record->depend_list) == NULL);
                        hammer_rel_mem_record(record);
                        return (EEXIST);
                }
                if (++trans->hmp->namekey_iterator == 0)
                        ++trans->hmp->namekey_iterator;
                record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                record->rec.base.base.key |= trans->hmp->namekey_iterator;
        }
        record->flags |= HAMMER_RECF_ONRBTREE;
        hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
        hammer_rel_mem_record(record);
        return(0);
}

/************************************************************************
 *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
        int error;

        /*
         * If the element is in-memory return it without searching the
         * on-disk B-Tree
         */
        error = hammer_mem_lookup(cursor, ip);
        if (error == 0) {
                cursor->record = &cursor->iprec->rec;
                return(error);
        }
        if (error != ENOENT)
                return(error);

        /*
         * If the inode has on-disk components search the on-disk B-Tree.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
                return(error);
        error = hammer_btree_lookup(cursor);
        if (error == 0)
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
        return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
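/*
 * Typical merged-iteration loop (sketch):
 *
 *      error = hammer_ip_first(&cursor, ip);
 *      while (error == 0) {
 *              (examine cursor.record)
 *              error = hammer_ip_next(&cursor);
 *      }
 */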
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
        int error;

        /*
         * Clean up fields and setup for merged scan
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        /*
         * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
         * exact lookup so if we get ENOENT we have to call the iterate
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
         *
         * EDEADLK can only occur if the lookup hit an empty internal
         * element and couldn't delete it.  Since this could only occur
         * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
                if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        if (hammer_debug_general & 0x2000)
                                kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
                        error = hammer_btree_iterate(cursor);
                }
                if (error && error != ENOENT)
                        return(error);
                if (error == 0) {
                        cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                } else {
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Search the in-memory record list (Red-Black tree).  Unlike the
         * B-Tree search, mem_first checks for records in the range.
         */
        error = hammer_mem_first(cursor, ip);
        if (error && error != ENOENT)
                return(error);
        if (error == 0) {
                cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
        }

        /*
         * This will return the first matching record.
         */
        return(hammer_ip_next(cursor));
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
        hammer_btree_elm_t elm;
        hammer_record_t rec, save;
        int error;
        int r;

next_btree:
        /*
         * Load the current on-disk and in-memory record.  If we ate any
         * records we have to get the next one.
         *
         * If we deleted the last on-disk record we had scanned ATEDISK will
         * be clear and DELBTREE will be set, forcing a call to iterate. The
         * fact that ATEDISK is clear causes iterate to re-test the 'current'
         * element.  If ATEDISK is set, iterate will skip the 'current'
         * element.
         *
         * Get the next on-disk record
         */
        if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        error = hammer_btree_iterate(cursor);
                        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
                        if (error == 0)
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        else
                                cursor->flags |= HAMMER_CURSOR_DISKEOF |
                                                 HAMMER_CURSOR_ATEDISK;
                }
        }

next_memory:
        /*
         * Get the next in-memory record.  The record can be ripped out
         * of the RB tree so we maintain a scan_info structure to track
         * the next node.
         *
         * hammer_rec_scan_cmp:  Is the record still in our general range,
         *                       (non-inclusive of snapshot exclusions)?
         * hammer_rec_scan_callback: Is the record in our snapshot?
         */
        if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
                if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
                        save = cursor->iprec;
                        cursor->iprec = NULL;
                        rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
                        while (rec) {
                                if (hammer_rec_scan_cmp(rec, cursor) != 0)
                                        break;
                                if (hammer_rec_scan_callback(rec, cursor) != 0)
                                        break;
                                rec = hammer_rec_rb_tree_RB_NEXT(rec);
                        }
                        if (save)
                                hammer_rel_mem_record(save);
                        if (cursor->iprec) {
                                KKASSERT(cursor->iprec == rec);
                                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
#if 0
                                cursor->scan.node =
                                        hammer_rec_rb_tree_RB_NEXT(rec);
#endif
                        } else {
                                cursor->flags |= HAMMER_CURSOR_MEMEOF;
                        }
                }
        }

        /*
         * Extract either the disk or memory record depending on their
         * relative position.
         */
        error = 0;
        switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
        case 0:
                /*
                 * Both entries valid
                 */
                elm = &cursor->node->ondisk->elms[cursor->index];
                r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
                if (r < 0) {
                        error = hammer_btree_extract(cursor,
                                                     HAMMER_CURSOR_GET_RECORD);
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                        break;
                }

                /*
                 * If the entries match the memory entry must specify
                 * an on-disk deletion.  Eat both entries unless the
                 * caller wants visibility into the special records.
                 */
                if (r == 0) {
                        KKASSERT(cursor->iprec->flags &
                                 HAMMER_RECF_DELETE_ONDISK);
                        if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
                                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                                cursor->flags |= HAMMER_CURSOR_ATEMEM;
                                goto next_btree;
                        }
                }
                /* fall through to the memory entry */
        case HAMMER_CURSOR_ATEDISK:
                /*
                 * Only the memory entry is valid.  If the record is
                 * placemarking an on-disk deletion, we skip it unless
                 * the caller wants special record visibility.
                 */
                cursor->record = &cursor->iprec->rec;
                cursor->flags |= HAMMER_CURSOR_ATEMEM;
                if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
                        if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
                                goto next_memory;
                }
                break;
        case HAMMER_CURSOR_ATEMEM:
                /*
                 * Only the disk entry is valid
                 */
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                break;
        default:
                /*
                 * Neither entry is valid
                 *
                 * XXX error not set properly
                 */
                cursor->record = NULL;
                error = ENOENT;
                break;
        }
        return(error);
}

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
        int error;

        if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
                cursor->data = cursor->iprec->data;
                error = 0;
        } else {
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
        }
        return(error);
}

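/*
 * Resolve both the cursor->record and cursor->data pointers for the
 * current cursor position, from either the in-memory record or the
 * on-disk B-Tree.
 */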
1336 int
1337 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1338 {
1339         int error;
1340
1341         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1342                 cursor->data = cursor->iprec->data;
1343                 error = 0;
1344         } else {
1345                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1346                                                      HAMMER_CURSOR_GET_RECORD);
1347         }
1348         return(error);
1349 }

/*
 * Delete all records within the specified range for inode ip.
 *
 * NOTE: An unaligned range will cause new records to be added to cover
 * the edge cases. (XXX not implemented yet).
 *
 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 * NOTE: Record keys for regular file data have to be special-cased since
 * they indicate the end of the range (key = base + bytes).
 */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

#if 0
	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = HAMMER_MAX_KEY;
		else
			cursor.key_end.key = tmp64;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is the offset
		 * of the last byte of the record (base + len - 1), NOT the
		 * base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx,%d\n",
				base->key - rec->base.data_len, rec->base.data_len);
#endif
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
					base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}
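
/*
 * Worked illustration (disabled): the "key = base + bytes" rule means a
 * 16KB data record covering file offsets [32768, 49151] is keyed at
 * 49152.  base->key is exclusive of the right edge, (key - data_len)
 * is the inclusive left edge, and ran_beg/ran_end are both inclusive.
 * The assertions mirror the in-range tests used by the loop above;
 * example_data_record_keying is a hypothetical helper.
 */
#if 0
static void
example_data_record_keying(void)
{
	int64_t key = 49152;		/* base (32768) + bytes (16384) */
	int32_t data_len = 16384;
	int64_t ran_beg = 32768;	/* inclusive left edge */
	int64_t ran_end = 49151;	/* inclusive right edge */

	KKASSERT(key - data_len >= ran_beg);	/* no left edge violation */
	KKASSERT(key - 1 <= ran_end);		/* record ends within range */
}
#endif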

/*
 * Delete all records associated with an inode except the inode record
 * itself.
 */
int
hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}
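
/*
 * Hedged caller sketch (disabled fragment): an inode-destruction flush
 * path might use this once no references remain; the flush transaction
 * 'trans' must be a HAMMER_TRANS_FLS transaction, as asserted above.
 */
#if 0
	error = hammer_ip_delete_range_all(trans, ip);
	if (error == 0) {
		/* only the inode record remains for this object id */
	}
#endif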

/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * This routine is only called from the backend.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 * cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;
	int dodelete;

	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);

	/*
	 * In-memory (unsynchronized) records can simply be freed.  This
	 * only occurs in range iterations since all other records are
	 * individually synchronized.  Thus there should be no confusion with
	 * the interlock.
	 */
	if (cursor->record == &cursor->iprec->rec) {
		KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
		cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
		cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
		hammer_cleardep_mem_record(cursor->iprec);
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	elm = NULL;
	hmp = cursor->node->hmp;

	dodelete = 0;
	if (error == 0) {
		error = hammer_cursor_upgrade(cursor);
		if (error == 0) {
			elm = &cursor->node->ondisk->elms[cursor->index];
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->leaf.base.delete_tid,
					   sizeof(elm->leaf.base.delete_tid));
			elm->leaf.base.delete_tid = tid;
			hammer_modify_node_done(cursor->node);

			/*
			 * An on-disk record cannot have the same delete_tid
			 * as its create_tid.  In a chain of record updates
			 * this could result in a duplicate record.
			 */
			KKASSERT(elm->leaf.base.delete_tid !=
				 elm->leaf.base.create_tid);
			hammer_modify_buffer(cursor->trans, cursor->record_buffer,
					     &cursor->record->base.base.delete_tid,
					     sizeof(hammer_tid_t));
			cursor->record->base.base.delete_tid = tid;
			hammer_modify_buffer_done(cursor->record_buffer);
		}
	}

	/*
	 * If we were mounted with the nohistory option, we physically
	 * delete the record.
	 */
	if (hmp->hflags & HMNT_NOHISTORY)
		dodelete = 1;

	if (error == 0 && dodelete) {
		error = hammer_delete_at_cursor(cursor, NULL);
		if (error) {
			panic("hammer_ip_delete_record: unable to physically "
			      "delete the record!\n");
			/* NOT REACHED */
			error = 0;
		}
	}
	return(error);
}

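/*
 * Illustrative sketch (disabled fragment, simplified): marking
 * delete_tid instead of physically removing the element is what
 * preserves historical as-of lookups.  A record is visible at
 * transaction id 'asof' roughly when:
 */
#if 0
	visible = (base->create_tid <= asof) &&
		  (base->delete_tid == 0 || base->delete_tid > asof);
#endif

/*
 * Physically delete the B-Tree element at the current cursor position
 * and free the record and data storage it references.
 */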
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
{
	hammer_btree_elm_t elm;
	hammer_off_t rec_offset;
	hammer_off_t data_offset;
	int32_t data_len;
	u_int16_t rec_type;
	int error;

	elm = &cursor->node->ondisk->elms[cursor->index];
	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

	rec_offset = elm->leaf.rec_offset;
	data_offset = elm->leaf.data_offset;
	data_len = elm->leaf.data_len;
	rec_type = elm->leaf.base.rec_type;

	error = hammer_btree_delete(cursor);
	if (error == 0) {
		/*
		 * This forces a fixup for the iteration because
		 * the cursor is now either sitting at the 'next'
		 * element or sitting at the end of a leaf.
		 */
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			cursor->flags |= HAMMER_CURSOR_DELBTREE;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
	}
	if (error == 0) {
		hammer_blockmap_free(cursor->trans, rec_offset,
				     sizeof(union hammer_record_ondisk));
	}
	if (error == 0) {
		switch(data_offset & HAMMER_OFF_ZONE_MASK) {
		case HAMMER_ZONE_LARGE_DATA:
		case HAMMER_ZONE_SMALL_DATA:
			hammer_blockmap_free(cursor->trans,
					     data_offset, data_len);
			break;
		default:
			break;
		}
	}
#if 0
	kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
		"(%d remain in cluster)\n",
		cluster->volume->vol_no, cluster->clu_no,
		rec_offset, data_offset, data_len,
		cluster->ondisk->stat_records);
#endif
	return (error);
}
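
/*
 * Sketch (disabled): a HAMMER blockmap offset encodes its zone in the
 * top bits, so masking with HAMMER_OFF_ZONE_MASK and comparing against
 * the HAMMER_ZONE_* constants classifies the storage, as in the switch
 * above.  offset_is_data is a hypothetical helper.
 */
#if 0
static __inline int
offset_is_data(hammer_off_t data_offset)
{
	hammer_off_t zone = data_offset & HAMMER_OFF_ZONE_MASK;

	return (zone == HAMMER_ZONE_LARGE_DATA ||
		zone == HAMMER_ZONE_SMALL_DATA);
}
#endif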

/*
 * Determine whether a directory is empty or not.  Returns 0 if the directory
 * is empty, ENOTEMPTY if it isn't, plus other possible errors.
 */
int
hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	int error;

	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);
	if (error == ENOENT)
		error = 0;
	else if (error == 0)
		error = ENOTEMPTY;
	hammer_done_cursor(&cursor);
	return(error);
}

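/*
 * Hedged caller sketch (disabled fragment): a directory removal path
 * would typically verify emptiness before deleting the directory
 * inode.  'trans' and 'ip' here are assumed to be set up by the caller.
 */
#if 0
	error = hammer_ip_check_directory_empty(trans, ip);
	if (error == ENOTEMPTY)
		return (ENOTEMPTY);	/* cannot rmdir a non-empty dir */
	if (error == 0) {
		/* safe to proceed with directory deletion */
	}
#endif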