HAMMER 40C/Many: Inode/link-count sequencer cleanup pass.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.50 2008/05/02 16:41:26 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71
72         /*
73          * Never match against an item deleted by the front-end.
74          */
75         if (rec1->flags & HAMMER_RECF_DELETED_FE)
76                 return(1);
77         if (rec2->flags & HAMMER_RECF_DELETED_FE)
78                 return(-1);
79
80         return(0);
81 }
82
83 static int
84 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
85 {
86         if (info->rec_type < rec->rec.base.base.rec_type)
87                 return(-3);
88         if (info->rec_type > rec->rec.base.base.rec_type)
89                 return(3);
90
91         if (info->key < rec->rec.base.base.key)
92                 return(-2);
93         if (info->key > rec->rec.base.base.key)
94                 return(2);
95
96         if (info->create_tid == 0) {
97                 if (rec->rec.base.base.create_tid == 0)
98                         return(0);
99                 return(1);
100         }
101         if (rec->rec.base.base.create_tid == 0)
102                 return(-1);
103         if (info->create_tid < rec->rec.base.base.create_tid)
104                 return(-1);
105         if (info->create_tid > rec->rec.base.base.create_tid)
106                 return(1);
107         return(0);
108 }
109
110 /*
111  * RB_SCAN comparison code for hammer_mem_first().  The argument order
112  * is reversed so the comparison result has to be negated.  key_beg and
113  * key_end are both range-inclusive.
114  *
115  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
116  * These do not stop the scan.
117  *
118  * Localized deletions are not cached in-memory.
119  */
120 static
121 int
122 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
123 {
124         hammer_cursor_t cursor = data;
125         int r;
126
127         r = hammer_rec_compare(&cursor->key_beg, rec);
128         if (r > 1)
129                 return(-1);
130         r = hammer_rec_compare(&cursor->key_end, rec);
131         if (r < -1)
132                 return(1);
133         return(0);
134 }
135
136 /*
137  * This compare function is used when simply looking up key_beg.
138  */
139 static
140 int
141 hammer_rec_find_cmp(hammer_record_t rec, void *data)
142 {
143         hammer_cursor_t cursor = data;
144         int r;
145
146         r = hammer_rec_compare(&cursor->key_beg, rec);
147         if (r > 1)
148                 return(-1);
149         if (r < -1)
150                 return(1);
151         return(0);
152 }
153
154 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
155 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
156                     hammer_rec_compare, hammer_base_elm_t);
157
158 /*
159  * Allocate a record for the caller to finish filling in.  The record is
160  * returned referenced.
161  */
162 hammer_record_t
163 hammer_alloc_mem_record(hammer_inode_t ip)
164 {
165         hammer_record_t record;
166
167         ++hammer_count_records;
168         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
169         record->flush_state = HAMMER_FST_IDLE;
170         record->ip = ip;
171         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
172         hammer_ref(&record->lock);
173         return (record);
174 }
175
176 void
177 hammer_wait_mem_record(hammer_record_t record)
178 {
179         while (record->flush_state == HAMMER_FST_FLUSH) {
180                 record->flags |= HAMMER_RECF_WANTED;
181                 tsleep(record, 0, "hmrrc2", 0);
182         }
183 }
184
185 /*
186  * Called from the backend, hammer_inode.c, after a record has been
187  * flushed to disk.  The record has been exclusively locked by the
188  * caller and interlocked with BE.
189  *
190  * We clean up the state, unlock, and release the record (the record
191  * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
192  */
193 void
194 hammer_flush_record_done(hammer_record_t record, int error)
195 {
196         hammer_inode_t target_ip;
197         int cleanup = 0;
198
199         KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
200         KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
201
202         if (error) {
203                 /*
204                  * An error occured, the backend was unable to sync the
205                  * record to its media.  Leave the record intact.
206                  */
207                 Debugger("flush_record_done error");
208         } else if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
209                 /*
210                  * deleted-record to delete-on-disk conversion, occurs when
211                  * we sync a record to disk which is marked deleted by the
212                  * frontend, but not deleted from the point of view of the
213                  * backend.
214                  */
215                 if (record->flags & HAMMER_RECF_DELETED_BE) {
216                         record->flags |= HAMMER_RECF_DELETED_FE;
217                         cleanup = 1;
218                 } else {
219                         KKASSERT(record->type == HAMMER_MEM_RECORD_DEL);
220                 }
221         } else {
222                 /*
223                  * Normal completion, record has been disposed of (by
224                  * having been synchronized to the media).
225                  */
226                 record->flags |= HAMMER_RECF_DELETED_FE;
227                 record->flags |= HAMMER_RECF_DELETED_BE;
228                 cleanup = 1;
229         }
230         if (cleanup) {
231                 if ((target_ip = record->target_ip) != NULL) {
232                         TAILQ_REMOVE(&target_ip->target_list, record,
233                                      target_entry);
234                         record->target_ip = NULL;
235                         hammer_test_inode(target_ip);
236                 }
237                 record->flush_state = HAMMER_FST_IDLE;
238         } else {
239                 if (record->target_ip)
240                         record->flush_state = HAMMER_FST_SETUP;
241                 else
242                         record->flush_state = HAMMER_FST_IDLE;
243         }
244
245         record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
246         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
247         if (record->flags & HAMMER_RECF_WANTED) {
248                 record->flags &= ~HAMMER_RECF_WANTED;
249                 wakeup(record);
250         }
251         hammer_rel_mem_record(record);
252 }
253
254 /*
255  * Release a memory record.  Records marked for deletion are immediately
256  * removed from the RB-Tree but otherwise left intact until the last ref
257  * goes away.
258  */
259 void
260 hammer_rel_mem_record(struct hammer_record *record)
261 {
262         hammer_inode_t ip, target_ip;
263
264         hammer_unref(&record->lock);
265
266         if (record->flags & HAMMER_RECF_DELETED_FE) {
267                 if (record->lock.refs == 0) {
268                         KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
269
270                         ip = record->ip;
271                         if ((target_ip = record->target_ip) != NULL) {
272                                 TAILQ_REMOVE(&target_ip->target_list,
273                                              record, target_entry);
274                                 record->target_ip = NULL;
275                                 hammer_test_inode(target_ip);
276                         }
277
278                         if (record->flags & HAMMER_RECF_ONRBTREE) {
279                                 RB_REMOVE(hammer_rec_rb_tree,
280                                           &record->ip->rec_tree,
281                                           record);
282                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
283                                 if (RB_EMPTY(&record->ip->rec_tree)) {
284                                         record->ip->flags &= ~HAMMER_INODE_XDIRTY;
285                                         hammer_test_inode(record->ip);
286                                 }
287                         }
288                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
289                                 --hammer_count_record_datas;
290                                 kfree(record->data, M_HAMMER);
291                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
292                         }
293                         record->data = NULL;
294                         --hammer_count_records;
295                         kfree(record, M_HAMMER);
296                         return;
297                 }
298         }
299 }
300
301 /*
302  * Record visibility depends on whether the record is being accessed by
303  * the backend or the frontend.
304  *
305  * Return non-zero if the record is visible, zero if it isn't or if it is
306  * deleted.
307  */
308 static __inline
309 int
310 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
311 {
312         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
313                 if (record->flags & HAMMER_RECF_DELETED_BE)
314                         return(0);
315                 if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
316                         return(0);
317         } else {
318                 if (record->flags & HAMMER_RECF_DELETED_FE)
319                         return(0);
320         }
321         return(1);
322 }
323
324 /*
325  * This callback is used as part of the RB_SCAN function for in-memory
326  * records.  We terminate it (return -1) as soon as we get a match.
327  *
328  * This routine is used by frontend code.
329  *
330  * The primary compare code does not account for ASOF lookups.  This
331  * code handles that case as well as a few others.
332  */
333 static
334 int
335 hammer_rec_scan_callback(hammer_record_t rec, void *data)
336 {
337         hammer_cursor_t cursor = data;
338
339         /*
340          * We terminate on success, so this should be NULL on entry.
341          */
342         KKASSERT(cursor->iprec == NULL);
343
344         /*
345          * Skip if the record was marked deleted.
346          */
347         if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
348                 return(0);
349
350         /*
351          * Skip if not visible due to our as-of TID
352          */
353         if (cursor->flags & HAMMER_CURSOR_ASOF) {
354                 if (cursor->asof < rec->rec.base.base.create_tid)
355                         return(0);
356                 if (rec->rec.base.base.delete_tid &&
357                     cursor->asof >= rec->rec.base.base.delete_tid) {
358                         return(0);
359                 }
360         }
361
362         /*
363          * If the record is queued to the flusher we have to block until
364          * it isn't.  Otherwise we may see duplication between our memory
365          * cache and the media.
366          */
367         hammer_ref(&rec->lock);
368
369 #warning "This deadlocks"
370 #if 0
371         if (rec->flush_state == HAMMER_FST_FLUSH)
372                 hammer_wait_mem_record(rec);
373 #endif
374
375         /*
376          * The record may have been deleted while we were blocked.
377          */
378         if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
379                 hammer_rel_mem_record(rec);
380                 return(0);
381         }
382
383         /*
384          * Set the matching record and stop the scan.
385          */
386         cursor->iprec = rec;
387         return(-1);
388 }
389
390
391 /*
392  * Lookup an in-memory record given the key specified in the cursor.  Works
393  * just like hammer_btree_lookup() but operates on an inode's in-memory
394  * record list.
395  *
396  * The lookup must fail if the record is marked for deferred deletion.
397  */
398 static
399 int
400 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
401 {
402         int error;
403
404         if (cursor->iprec) {
405                 hammer_rel_mem_record(cursor->iprec);
406                 cursor->iprec = NULL;
407         }
408         if (cursor->ip) {
409                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
410                 --cursor->ip->cursor_ip_refs;
411 #if 0
412                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
413                                                   &cursor->ip->rec_tree);
414 #endif
415         }
416         cursor->ip = ip;
417 #if 0
418         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
419 #endif
420         ++ip->cursor_ip_refs;
421
422 #if 0
423         cursor->scan.node = NULL;
424 #endif
425         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
426                                    hammer_rec_scan_callback, cursor);
427
428         if (cursor->iprec == NULL)
429                 error = ENOENT;
430         else
431                 error = 0;
432         return(error);
433 }
434
435 /*
436  * hammer_mem_first() - locate the first in-memory record matching the
437  * cursor within the bounds of the key range.
438  */
439 static
440 int
441 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
442 {
443         if (cursor->iprec) {
444                 hammer_rel_mem_record(cursor->iprec);
445                 cursor->iprec = NULL;
446         }
447         if (cursor->ip) {
448                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
449                 --cursor->ip->cursor_ip_refs;
450 #if 0
451                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
452                                                   &cursor->ip->rec_tree);
453 #endif
454         }
455         cursor->ip = ip;
456 #if 0
457         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
458 #endif
459         ++ip->cursor_ip_refs;
460
461 #if 0
462         cursor->scan.node = NULL;
463 #endif
464         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
465                                    hammer_rec_scan_callback, cursor);
466
467         /*
468          * Adjust scan.node and keep it linked into the RB-tree so we can
469          * hold the cursor through third party modifications of the RB-tree.
470          */
471         if (cursor->iprec) {
472 #if 0
473                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
474 #endif
475                 return(0);
476         }
477         return(ENOENT);
478 }
479
480 void
481 hammer_mem_done(hammer_cursor_t cursor)
482 {
483         if (cursor->ip) {
484                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
485                 --cursor->ip->cursor_ip_refs;
486 #if 0
487                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
488                                                   &cursor->ip->rec_tree);
489 #endif
490                 cursor->ip = NULL;
491         }
492         if (cursor->iprec) {
493                 hammer_rel_mem_record(cursor->iprec);
494                 cursor->iprec = NULL;
495         }
496 }
497
498 /************************************************************************
499  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
500  ************************************************************************
501  *
502  * These functions manipulate in-memory records.  Such records typically
503  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
504  */
505
506 /*
507  * Add a directory entry (dip,ncp) which references inode (ip).
508  *
509  * Note that the low 32 bits of the namekey are set temporarily to create
510  * a unique in-memory record, and may be modified a second time when the
511  * record is synchronized to disk.  In particular, the low 32 bits cannot be
512  * all 0's when synching to disk, which is not handled here.
513  */
514 int
515 hammer_ip_add_directory(struct hammer_transaction *trans,
516                      struct hammer_inode *dip, struct namecache *ncp,
517                      struct hammer_inode *ip)
518 {
519         hammer_record_t record;
520         int error;
521         int bytes;
522
523         record = hammer_alloc_mem_record(dip);
524
525         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
526         if (++trans->hmp->namekey_iterator == 0)
527                 ++trans->hmp->namekey_iterator;
528
529         record->type = HAMMER_MEM_RECORD_ADD;
530         record->rec.entry.base.base.obj_id = dip->obj_id;
531         record->rec.entry.base.base.key =
532                 hammer_directory_namekey(ncp->nc_name, bytes);
533         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
534         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
535         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
536         record->rec.entry.obj_id = ip->obj_id;
537         record->data = (void *)ncp->nc_name;
538         record->rec.entry.base.data_len = bytes;
539         ++ip->ino_rec.ino_nlinks;
540         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
541
542         /*
543          * The target inode and the directory entry are bound together.
544          */
545         record->target_ip = ip;
546         record->flush_state = HAMMER_FST_SETUP;
547         TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
548
549         /*
550          * The inode now has a dependancy and must be taken out of the idle
551          * state.  An inode not in an idle state is given an extra reference.
552          */
553         if (ip->flush_state == HAMMER_FST_IDLE) {
554                 hammer_ref(&ip->lock);
555                 ip->flush_state = HAMMER_FST_SETUP;
556         }
557
558         /* NOTE: copies record->data */
559         error = hammer_mem_add(trans, record);
560         return(error);
561 }
562
563 /*
564  * Delete the directory entry and update the inode link count.  The
565  * cursor must be seeked to the directory entry record being deleted.
566  *
567  * The related inode should be share-locked by the caller.  The caller is
568  * on the frontend.
569  *
570  * This function can return EDEADLK requiring the caller to terminate
571  * the cursor, any locks, wait on the returned record, and retry.
572  */
573 int
574 hammer_ip_del_directory(struct hammer_transaction *trans,
575                      hammer_cursor_t cursor, struct hammer_inode *dip,
576                      struct hammer_inode *ip)
577 {
578         hammer_record_t record;
579         int error;
580
581         if (cursor->record == &cursor->iprec->rec) {
582                 /*
583                  * In-memory (unsynchronized) records can simply be freed.
584                  * Even though the HAMMER_RECF_DELETED_FE flag is ignored
585                  * by the backend, we must still avoid races against the
586                  * backend potentially syncing the record to the media. 
587                  *
588                  * We cannot call hammer_ip_delete_record(), that routine may
589                  * only be called from the backend.
590                  */
591                 record = cursor->iprec;
592                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
593                         KKASSERT(cursor->deadlk_rec == NULL);
594                         hammer_ref(&record->lock);
595                         cursor->deadlk_rec = record;
596                         error = EDEADLK;
597                 } else {
598                         KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
599                         record->flags |= HAMMER_RECF_DELETED_FE;
600                         error = 0;
601                 }
602         } else {
603                 /*
604                  * If the record is on-disk we have to queue the deletion by
605                  * the record's key.  This also causes lookups to skip the
606                  * record.
607                  */
608                 record = hammer_alloc_mem_record(dip);
609                 record->type = HAMMER_MEM_RECORD_DEL;
610                 record->rec.entry.base.base = cursor->record->base.base;
611                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
612
613                 record->target_ip = ip;
614                 record->flush_state = HAMMER_FST_SETUP;
615                 TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
616
617                 /*
618                  * The inode now has a dependancy and must be taken out of
619                  * the idle state.  An inode not in an idle state is given
620                  * an extra reference.
621                  */
622                 if (ip->flush_state == HAMMER_FST_IDLE) {
623                         hammer_ref(&ip->lock);
624                         ip->flush_state = HAMMER_FST_SETUP;
625                 }
626
627                 error = hammer_mem_add(trans, record);
628         }
629
630         /*
631          * One less link.  The file may still be open in the OS even after
632          * all links have gone away so we only try to sync if the OS has
633          * no references and nlinks falls to 0.
634          *
635          * We have to terminate the cursor before syncing the inode to
636          * avoid deadlocking against ourselves.
637          *
638          * XXX we can't sync the inode here because the encompassing
639          * transaction might be a rename and might update the inode
640          * again with a new link.  That would force the delete_tid to be
641          * the same as the create_tid and cause a panic.
642          */
643         if (error == 0) {
644                 --ip->ino_rec.ino_nlinks;
645                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
646                 if (ip->ino_rec.ino_nlinks == 0 &&
647                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
648                         hammer_done_cursor(cursor);
649                 }
650
651         }
652         return(error);
653 }
654
655 /*
656  * Add a record to an inode.
657  *
658  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
659  * initialize the following additional fields:
660  *
661  * The related inode should be share-locked by the caller.  The caller is
662  * on the frontend.
663  *
664  * record->rec.entry.base.base.key
665  * record->rec.entry.base.base.rec_type
666  * record->rec.entry.base.base.data_len
667  * record->data         (a copy will be kmalloc'd if it cannot be embedded)
668  */
669 int
670 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
671 {
672         hammer_inode_t ip = record->ip;
673         int error;
674
675         record->rec.base.base.obj_id = ip->obj_id;
676         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
677
678         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
679
680         /* NOTE: copies record->data */
681         error = hammer_mem_add(trans, record);
682         return(error);
683 }
684
685 /*
686  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
687  * is called via the strategy called from a cached data source.  This code
688  * is responsible for actually writing a data record out to the disk.
689  *
690  * This can only occur non-historically (i.e. 'current' data only).
691  *
692  * The file offset must be HAMMER_BUFSIZE aligned but the data length
693  * can be truncated.  The record (currently) always represents a BUFSIZE
694  * swath of space whether the data is truncated or not.
695  */
696 int
697 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
698                        int64_t offset, void *data, int bytes)
699 {
700         struct hammer_cursor cursor;
701         hammer_record_ondisk_t rec;
702         union hammer_btree_elm elm;
703         hammer_off_t rec_offset;
704         void *bdata;
705         int error;
706
707         KKASSERT((offset & HAMMER_BUFMASK) == 0);
708         KKASSERT(trans->type == HAMMER_TRANS_FLS);
709 retry:
710         error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
711         if (error)
712                 return(error);
713         cursor.key_beg.obj_id = ip->obj_id;
714         cursor.key_beg.key = offset + bytes;
715         cursor.key_beg.create_tid = trans->tid;
716         cursor.key_beg.delete_tid = 0;
717         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
718         cursor.asof = trans->tid;
719         cursor.flags |= HAMMER_CURSOR_INSERT;
720         cursor.flags |= HAMMER_CURSOR_BACKEND;
721
722         /*
723          * Issue a lookup to position the cursor.
724          */
725         error = hammer_btree_lookup(&cursor);
726         if (error == 0) {
727                 kprintf("hammer_ip_sync_data: duplicate data at "
728                         "(%lld,%d) tid %016llx\n",
729                         offset, bytes, trans->tid);
730                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
731                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
732                 panic("Duplicate data");
733                 error = EIO;
734         }
735         if (error != ENOENT)
736                 goto done;
737
738         /*
739          * Allocate record and data space.  HAMMER_RECTYPE_DATA records
740          * can cross buffer boundaries so we may have to split our bcopy.
741          */
742         rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
743                                   &cursor.record_buffer,
744                                   bytes, &bdata,
745                                   &cursor.data_buffer, &error);
746         if (rec == NULL)
747                 goto done;
748         if (hammer_debug_general & 0x1000)
749                 kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);
750
751         /*
752          * Fill everything in and insert our B-Tree node.
753          *
754          * NOTE: hammer_alloc_record() has already marked the related
755          * buffers as modified.  If we do it again we will generate
756          * unnecessary undo elements.
757          */
758         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
759         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
760         rec->base.base.obj_id = ip->obj_id;
761         rec->base.base.key = offset + bytes;
762         rec->base.base.create_tid = trans->tid;
763         rec->base.base.delete_tid = 0;
764         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
765         rec->base.data_crc = crc32(data, bytes);
766         hammer_modify_buffer_done(cursor.record_buffer);
767         KKASSERT(rec->base.data_len == bytes);
768
769         hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
770         bcopy(data, bdata, bytes);
771         hammer_modify_buffer_done(cursor.data_buffer);
772
773         elm.leaf.base = rec->base.base;
774         elm.leaf.rec_offset = rec_offset;
775         elm.leaf.data_offset = rec->base.data_off;
776         elm.leaf.data_len = bytes;
777         elm.leaf.data_crc = rec->base.data_crc;
778
779         /*
780          * Data records can wind up on-disk before the inode itself is
781          * on-disk.  One must assume data records may be on-disk if either
782          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
783          */
784         ip->flags |= HAMMER_INODE_DONDISK;
785
786         error = hammer_btree_insert(&cursor, &elm);
787         if (error == 0)
788                 goto done;
789
790         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
791 done:
792         hammer_done_cursor(&cursor);
793         if (error == EDEADLK)
794                 goto retry;
795         return(error);
796 }
797
798 /*
799  * Sync an in-memory record to the disk.  This is called by the backend.
800  * This code is responsible for actually writing a record out to the disk.
801  *
802  * This routine can only be called by the backend and the record
803  * must have been interlocked with BE.  It will remain interlocked on
804  * return.  The caller is responsible for the record's disposition.
805  */
806 int
807 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
808 {
809         struct hammer_cursor cursor;
810         hammer_record_ondisk_t rec;
811         union hammer_btree_elm elm;
812         hammer_off_t rec_offset;
813         void *bdata;
814         int error;
815
816         KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
817         KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
818
819 retry:
820         /*
821          * Get a cursor, we will either be inserting or deleting.
822          */
823         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
824         if (error)
825                 return(error);
826         cursor.key_beg = record->rec.base.base;
827         cursor.flags |= HAMMER_CURSOR_BACKEND;
828
829         /*
830          * If we are deleting an exact match must be found on-disk.
831          */
832         if (record->type == HAMMER_MEM_RECORD_DEL) {
833                 error = hammer_btree_lookup(&cursor);
834                 if (error == 0)
835                         error = hammer_ip_delete_record(&cursor, trans->tid);
836                 goto done;
837         }
838
839         /*
840          * We are inserting.
841          *
842          * Issue a lookup to position the cursor and locate the cluster.  The
843          * target key should not exist.  If we are creating a directory entry
844          * we may have to iterate the low 32 bits of the key to find an unused
845          * key.
846          */
847         cursor.flags |= HAMMER_CURSOR_INSERT;
848
849         for (;;) {
850                 error = hammer_btree_lookup(&cursor);
851                 if (error)
852                         break;
853                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
854                         kprintf("hammer_ip_sync_record: duplicate rec "
855                                 "at (%016llx)\n", record->rec.base.base.key);
856                         Debugger("duplicate record1");
857                         error = EIO;
858                         break;
859                 }
860                 if (++trans->hmp->namekey_iterator == 0)
861                         ++trans->hmp->namekey_iterator;
862                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
863                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
864                 cursor.key_beg.key = record->rec.base.base.key;
865         }
866         if (error != ENOENT)
867                 goto done;
868
869         /*
870          * Allocate the record and data.  The result buffers will be
871          * marked as being modified and further calls to
872          * hammer_modify_buffer() will result in unneeded UNDO records.
873          *
874          * Support zero-fill records (data == NULL and data_len != 0)
875          */
876         if (record->data == NULL) {
877                 rec = hammer_alloc_record(trans, &rec_offset,
878                                           record->rec.base.base.rec_type,
879                                           &cursor.record_buffer,
880                                           0, &bdata,
881                                           NULL, &error);
882                 if (hammer_debug_general & 0x1000)
883                         kprintf("NULL RECORD DATA\n");
884         } else if (record->flags & HAMMER_RECF_INBAND) {
885                 rec = hammer_alloc_record(trans, &rec_offset,
886                                           record->rec.base.base.rec_type,
887                                           &cursor.record_buffer,
888                                           record->rec.base.data_len, &bdata,
889                                           NULL, &error);
890                 if (hammer_debug_general & 0x1000)
891                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
892         } else {
893                 rec = hammer_alloc_record(trans, &rec_offset,
894                                           record->rec.base.base.rec_type,
895                                           &cursor.record_buffer,
896                                           record->rec.base.data_len, &bdata,
897                                           &cursor.data_buffer, &error);
898                 if (hammer_debug_general & 0x1000)
899                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
900         }
901
902         if (rec == NULL)
903                 goto done;
904
905         /*
906          * Fill in the remaining fields and insert our B-Tree node.
907          */
908         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
909         rec->base.base = record->rec.base.base;
910         bcopy(&record->rec.base + 1, &rec->base + 1,
911               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
912
913         /*
914          * Copy the data and deal with zero-fill support.
915          */
916         if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
917                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
918                 bcopy(record->data, bdata, rec->base.data_len);
919         } else if (record->data) {
920                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
921                 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
922                 bcopy(record->data, bdata, rec->base.data_len);
923                 hammer_modify_buffer_done(cursor.data_buffer);
924         } else {
925                 rec->base.data_len = record->rec.base.data_len;
926         }
927         hammer_modify_buffer_done(cursor.record_buffer);
928
929         elm.leaf.base = record->rec.base.base;
930         elm.leaf.rec_offset = rec_offset;
931         elm.leaf.data_offset = rec->base.data_off;
932         elm.leaf.data_len = rec->base.data_len;
933         elm.leaf.data_crc = rec->base.data_crc;
934
935         error = hammer_btree_insert(&cursor, &elm);
936
937         /*
938          * This occurs when the frontend creates a record and queues it to
939          * the backend, then tries to delete the record.  The backend must
940          * still sync the record to the media as if it were not deleted,
941          * but must interlock with the frontend to ensure that the 
942          * synchronized record is not visible to the frontend, which means
943          * converting it from an ADD record to a DEL record.
944          *
945          * The DEL record then masks the record synced to disk until another
946          * round can delete it for real.
947          */
948         if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) {
949                 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
950                 record->flags &= ~HAMMER_RECF_DELETED_FE;
951                 record->type = HAMMER_MEM_RECORD_DEL;
952                 if (record->flush_state == HAMMER_FST_SETUP) {
953                         hammer_test_inode(record->ip);
954                         hammer_test_inode(record->target_ip);
955                 }
956         }
957
958         /*
959          * If the error occured unwind the operation.
960          */
961         if (error)
962                 hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
963
964 done:
965         hammer_done_cursor(&cursor);
966         if (error == EDEADLK)
967                 goto retry;
968         return(error);
969 }
970
971 /*
972  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
973  * entry's key is used to deal with hash collisions in the upper 32 bits.
974  * A unique 64 bit key is generated in-memory and may be regenerated a
975  * second time when the directory record is flushed to the on-disk B-Tree.
976  *
977  * A referenced record is passed to this function.  This function
978  * eats the reference.  If an error occurs the record will be deleted.
979  *
980  * A copy of the temporary record->data pointer provided by the caller
981  * will be made.
982  */
983 static
984 int
985 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
986 {
987         void *data;
988         int bytes;
989         int reclen;
990                 
991         /*
992          * Make a private copy of record->data
993          */
994         if (record->data) {
995                 /*
996                  * Try to embed the data in extra space in the record
997                  * union, otherwise allocate a copy.
998                  */
999                 bytes = record->rec.base.data_len;
1000                 switch(record->rec.base.base.rec_type) {
1001                 case HAMMER_RECTYPE_DIRENTRY:
1002                         reclen = offsetof(struct hammer_entry_record, name[0]);
1003                         break;
1004                 case HAMMER_RECTYPE_DATA:
1005                         reclen = offsetof(struct hammer_data_record, data[0]);
1006                         break;
1007                 default:
1008                         reclen = sizeof(record->rec);
1009                         break;
1010                 }
1011                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
1012                         bcopy(record->data, (char *)&record->rec + reclen,
1013                               bytes);
1014                         record->data = (void *)((char *)&record->rec + reclen);
1015                         record->flags |= HAMMER_RECF_INBAND;
1016                 } else {
1017                         ++hammer_count_record_datas;
1018                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
1019                         record->flags |= HAMMER_RECF_ALLOCDATA;
1020                         bcopy(record->data, data, bytes);
1021                         record->data = data;
1022                 }
1023         }
1024
1025         /*
1026          * Insert into the RB tree, find an unused iterator if this is
1027          * a directory entry.
1028          */
1029         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
1030                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
1031                         record->flags |= HAMMER_RECF_DELETED_FE;
1032                         hammer_rel_mem_record(record);
1033                         return (EEXIST);
1034                 }
1035                 if (++trans->hmp->namekey_iterator == 0)
1036                         ++trans->hmp->namekey_iterator;
1037                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
1038                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
1039         }
1040         record->flags |= HAMMER_RECF_ONRBTREE;
1041         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
1042         hammer_rel_mem_record(record);
1043         return(0);
1044 }
1045
1046 /************************************************************************
1047  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
1048  ************************************************************************
1049  *
1050  * These functions augment the B-Tree scanning functions in hammer_btree.c
1051  * by merging in-memory records with on-disk records.
1052  */
1053
1054 /*
1055  * Locate a particular record either in-memory or on-disk.
1056  *
1057  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1058  * NOT be called to iterate results.
1059  */
1060 int
1061 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
1062 {
1063         int error;
1064
1065         /*
1066          * If the element is in-memory return it without searching the
1067          * on-disk B-Tree
1068          */
1069         error = hammer_mem_lookup(cursor, ip);
1070         if (error == 0) {
1071                 cursor->record = &cursor->iprec->rec;
1072                 return(error);
1073         }
1074         if (error != ENOENT)
1075                 return(error);
1076
1077         /*
1078          * If the inode has on-disk components search the on-disk B-Tree.
1079          */
1080         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1081                 return(error);
1082         error = hammer_btree_lookup(cursor);
1083         if (error == 0)
1084                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1085         return(error);
1086 }
1087
1088 /*
1089  * Locate the first record within the cursor's key_beg/key_end range,
1090  * restricted to a particular inode.  0 is returned on success, ENOENT
1091  * if no records matched the requested range, or some other error.
1092  *
1093  * When 0 is returned hammer_ip_next() may be used to iterate additional
1094  * records within the requested range.
1095  *
1096  * This function can return EDEADLK, requiring the caller to terminate
1097  * the cursor and try again.
1098  */
1099 int
1100 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
1101 {
1102         int error;
1103
1104         /*
1105          * Clean up fields and setup for merged scan
1106          */
1107         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1108         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
1109         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
1110         if (cursor->iprec) {
1111                 hammer_rel_mem_record(cursor->iprec);
1112                 cursor->iprec = NULL;
1113         }
1114
1115         /*
1116          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
1117          * exact lookup so if we get ENOENT we have to call the iterate
1118          * function to validate the first record after the begin key.
1119          *
1120          * The ATEDISK flag is used by hammer_btree_iterate to determine
1121          * whether it must index forwards or not.  It is also used here
1122          * to select the next record from in-memory or on-disk.
1123          *
1124          * EDEADLK can only occur if the lookup hit an empty internal
1125          * element and couldn't delete it.  Since this could only occur
1126          * in-range, we can just iterate from the failure point.
1127          */
1128         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1129                 error = hammer_btree_lookup(cursor);
1130                 if (error == ENOENT || error == EDEADLK) {
1131                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1132                         if (hammer_debug_general & 0x2000)
1133                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1134                         error = hammer_btree_iterate(cursor);
1135                 }
1136                 if (error && error != ENOENT) 
1137                         return(error);
1138                 if (error == 0) {
1139                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1140                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1141                 } else {
1142                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1143                 }
1144         }
1145
1146         /*
1147          * Search the in-memory record list (Red-Black tree).  Unlike the
1148          * B-Tree search, mem_first checks for records in the range.
1149          */
1150         error = hammer_mem_first(cursor, ip);
1151         if (error && error != ENOENT)
1152                 return(error);
1153         if (error == 0) {
1154                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1155                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1156                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1157                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1158         }
1159
1160         /*
1161          * This will return the first matching record.
1162          */
1163         return(hammer_ip_next(cursor));
1164 }
1165
1166 /*
1167  * Retrieve the next record in a merged iteration within the bounds of the
1168  * cursor.  This call may be made multiple times after the cursor has been
1169  * initially searched with hammer_ip_first().
1170  *
1171  * 0 is returned on success, ENOENT if no further records match the
1172  * requested range, or some other error code is returned.
1173  */
1174 int
1175 hammer_ip_next(hammer_cursor_t cursor)
1176 {
1177         hammer_btree_elm_t elm;
1178         hammer_record_t rec, save;
1179         int error;
1180         int r;
1181
1182 next_btree:
1183         /*
1184          * Load the current on-disk and in-memory record.  If we ate any
1185          * records we have to get the next one. 
1186          *
1187          * If we deleted the last on-disk record we had scanned ATEDISK will
1188          * be clear and DELBTREE will be set, forcing a call to iterate. The
1189          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1190          * element.  If ATEDISK is set, iterate will skip the 'current'
1191          * element.
1192          *
1193          * Get the next on-disk record
1194          */
1195         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1196                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1197                         error = hammer_btree_iterate(cursor);
1198                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1199                         if (error == 0)
1200                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1201                         else
1202                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1203                                                  HAMMER_CURSOR_ATEDISK;
1204                 }
1205         }
1206
1207 next_memory:
1208         /*
1209          * Get the next in-memory record.  The record can be ripped out
1210          * of the RB tree so we maintain a scan_info structure to track
1211          * the next node.
1212          *
1213          * hammer_rec_scan_cmp:  Is the record still in our general range,
1214          *                       (non-inclusive of snapshot exclusions)?
1215          * hammer_rec_scan_callback: Is the record in our snapshot?
1216          */
1217         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1218                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1219                         save = cursor->iprec;
1220                         cursor->iprec = NULL;
1221                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1222                         while (rec) {
1223                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1224                                         break;
1225                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1226                                         break;
1227                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1228                         }
1229                         if (save)
1230                                 hammer_rel_mem_record(save);
1231                         if (cursor->iprec) {
1232                                 KKASSERT(cursor->iprec == rec);
1233                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1234 #if 0
1235                                 cursor->scan.node =
1236                                         hammer_rec_rb_tree_RB_NEXT(rec);
1237 #endif
1238                         } else {
1239                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1240                         }
1241                 }
1242         }
1243
1244         /*
1245          * Extract either the disk or memory record depending on their
1246          * relative position.
1247          */
1248         error = 0;
1249         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1250         case 0:
1251                 /*
1252                  * Both entries valid
1253                  */
1254                 elm = &cursor->node->ondisk->elms[cursor->index];
1255                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1256                 if (r < 0) {
1257                         error = hammer_btree_extract(cursor,
1258                                                      HAMMER_CURSOR_GET_RECORD);
1259                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1260                         break;
1261                 }
1262
1263                 /*
1264                  * If the entries match exactly the memory entry typically
1265                  * specifies an on-disk deletion and we eat both entries.
1266                  *
1267                  * If the in-memory record is not an on-disk deletion we
1268                  * probably caught the syncer while it was syncing it to
1269                  * the media.  Since we hold a shared lock on the cursor,
1270                  * the in-memory record had better be marked deleted at
1271                  * this point.
1272                  */
1273                 if (r == 0) {
1274                         if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1275                                 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1276                                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1277                                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1278                                         goto next_btree;
1279                                 }
1280                         } else {
1281                                 KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0);
1282                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1283                                 goto next_memory;
1284                         }
1285                 }
1286                 /* fall through to the memory entry */
1287         case HAMMER_CURSOR_ATEDISK:
1288                 /*
1289                  * Only the memory entry is valid.  If the record is
1290                  * placemarking an on-disk deletion, we skip it unless
1291                  * the caller wants special record visibility.
1292                  */
1293                 cursor->record = &cursor->iprec->rec;
1294                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1295                 if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1296                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1297                                 goto next_memory;
1298                 }
1299                 break;
1300         case HAMMER_CURSOR_ATEMEM:
1301                 /*
1302                  * Only the disk entry is valid
1303                  */
1304                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1305                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1306                 break;
1307         default:
1308                 /*
1309                  * Neither entry is valid
1310                  *
1311                  * XXX error not set properly
1312                  */
1313                 cursor->record = NULL;
1314                 error = ENOENT;
1315                 break;
1316         }
1317         return(error);
1318 }
1319
1320 /*
1321  * Resolve the cursor->data pointer for the current cursor position in
1322  * a merged iteration.
1323  */
1324 int
1325 hammer_ip_resolve_data(hammer_cursor_t cursor)
1326 {
1327         int error;
1328
1329         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1330                 cursor->data = cursor->iprec->data;
1331                 error = 0;
1332         } else {
1333                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1334         }
1335         return(error);
1336 }
1337
1338 int
1339 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1340 {
1341         int error;
1342
1343         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1344                 cursor->data = cursor->iprec->data;
1345                 error = 0;
1346         } else {
1347                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1348                                                      HAMMER_CURSOR_GET_RECORD);
1349         }
1350         return(error);
1351 }
1352
1353 /*
1354  * Delete all records within the specified range for inode ip.
1355  *
1356  * NOTE: An unaligned range will cause new records to be added to cover
1357  * the edge cases. (XXX not implemented yet).
1358  *
1359  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1360  *
1361  * NOTE: Record keys for regular file data have to be special-cased since
1362  * they indicate the end of the range (key = base + bytes).
1363  */
1364 int
1365 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1366                        int64_t ran_beg, int64_t ran_end)
1367 {
1368         struct hammer_cursor cursor;
1369         hammer_record_ondisk_t rec;
1370         hammer_base_elm_t base;
1371         int error;
1372         int64_t off;
1373
1374 #if 0
1375         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1376 #endif
1377
1378         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1379 retry:
1380         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1381
1382         cursor.key_beg.obj_id = ip->obj_id;
1383         cursor.key_beg.create_tid = 0;
1384         cursor.key_beg.delete_tid = 0;
1385         cursor.key_beg.obj_type = 0;
1386         cursor.asof = ip->obj_asof;
1387         cursor.flags |= HAMMER_CURSOR_ASOF;
1388         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1389         cursor.flags |= HAMMER_CURSOR_BACKEND;
1390
1391         cursor.key_end = cursor.key_beg;
1392         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1393                 cursor.key_beg.key = ran_beg;
1394                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1395                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1396                 cursor.key_end.key = ran_end;
1397         } else {
1398                 /*
1399                  * The key in the B-Tree is (base+bytes), so the first possible
1400                  * matching key is ran_beg + 1.
1401                  */
1402                 int64_t tmp64;
1403
1404                 cursor.key_beg.key = ran_beg + 1;
1405                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1406                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1407
1408                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1409                 if (tmp64 < ran_end)
1410                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1411                 else
1412                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1413         }
1414         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1415
1416         error = hammer_ip_first(&cursor, ip);
1417
1418         /*
1419          * Iterate through matching records and mark them as deleted.
1420          */
1421         while (error == 0) {
1422                 rec = cursor.record;
1423                 base = &rec->base.base;
1424
1425                 KKASSERT(base->delete_tid == 0);
1426
1427                 /*
1428                  * There may be overlap cases for regular file data.  Also
1429                  * remember the key for a regular file record is the offset
1430                  * of the last byte of the record (base + len - 1), NOT the
1431                  * base offset.
1432                  */
1433 #if 0
1434                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1435 #endif
1436                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1437 #if 0
1438                         kprintf("delete_range loop key %016llx,%d\n",
1439                                 base->key - rec->base.data_len, rec->base.data_len);
1440 #endif
1441                         off = base->key - rec->base.data_len;
1442                         /*
1443                          * Check the left edge case.  We currently do not
1444                          * split existing records.
1445                          */
1446                         if (off < ran_beg) {
1447                                 panic("hammer left edge case %016llx %d\n",
1448                                         base->key, rec->base.data_len);
1449                         }
1450
1451                         /*
1452                          * Check the right edge case.  Note that the
1453                          * record can be completely out of bounds, which
1454                          * terminates the search.
1455                          *
1456                          * base->key is exclusive of the right edge while
1457                          * ran_end is inclusive of the right edge.  The
1458                          * (key - data_len) left boundary is inclusive.
1459                          *
1460                          * XXX theory-check this test at some point, are
1461                          * we missing a + 1 somewhere?  Note that ran_end
1462                          * could overflow.
1463                          */
1464                         if (base->key - 1 > ran_end) {
1465                                 if (base->key - rec->base.data_len > ran_end)
1466                                         break;
1467                                 panic("hammer right edge case\n");
1468                         }
1469                 }
1470
1471                 /*
1472                  * Mark the record and B-Tree entry as deleted.  This will
1473                  * also physically delete the B-Tree entry, record, and
1474                  * data if the retention policy dictates.  The function
1475                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1476                  * uses to perform a fixup.
1477                  */
1478                 error = hammer_ip_delete_record(&cursor, trans->tid);
1479                 if (error)
1480                         break;
1481                 error = hammer_ip_next(&cursor);
1482         }
1483         hammer_done_cursor(&cursor);
1484         if (error == EDEADLK)
1485                 goto retry;
1486         if (error == ENOENT)
1487                 error = 0;
1488         return(error);
1489 }
1490
1491 /*
1492  * Delete all user records associated with an inode except the inode record
1493  * itself.  Directory entries are not deleted (they must be properly disposed
1494  * of or nlinks would get upset).
1495  */
1496 int
1497 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip,
1498                            int *countp)
1499 {
1500         struct hammer_cursor cursor;
1501         hammer_record_ondisk_t rec;
1502         hammer_base_elm_t base;
1503         int error;
1504
1505         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1506 retry:
1507         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1508
1509         cursor.key_beg.obj_id = ip->obj_id;
1510         cursor.key_beg.create_tid = 0;
1511         cursor.key_beg.delete_tid = 0;
1512         cursor.key_beg.obj_type = 0;
1513         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1514         cursor.key_beg.key = HAMMER_MIN_KEY;
1515
1516         cursor.key_end = cursor.key_beg;
1517         cursor.key_end.rec_type = 0xFFFF;
1518         cursor.key_end.key = HAMMER_MAX_KEY;
1519
1520         cursor.asof = ip->obj_asof;
1521         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1522         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1523         cursor.flags |= HAMMER_CURSOR_BACKEND;
1524
1525         error = hammer_ip_first(&cursor, ip);
1526
1527         /*
1528          * Iterate through matching records and mark them as deleted.
1529          */
1530         while (error == 0) {
1531                 rec = cursor.record;
1532                 base = &rec->base.base;
1533
1534                 KKASSERT(base->delete_tid == 0);
1535
1536                 /*
1537                  * Mark the record and B-Tree entry as deleted.  This will
1538                  * also physically delete the B-Tree entry, record, and
1539                  * data if the retention policy dictates.  The function
1540                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1541                  * uses to perform a fixup.
1542                  *
1543                  * Directory entries (and delete-on-disk directory entries)
1544                  * must be synced and cannot be deleted.
1545                  */
1546                 if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
1547                         error = hammer_ip_delete_record(&cursor, trans->tid);
1548                         ++*countp;
1549                 }
1550                 if (error)
1551                         break;
1552                 error = hammer_ip_next(&cursor);
1553         }
1554         hammer_done_cursor(&cursor);
1555         if (error == EDEADLK)
1556                 goto retry;
1557         if (error == ENOENT)
1558                 error = 0;
1559         return(error);
1560 }
1561
1562 /*
1563  * Delete the record at the current cursor.  On success the cursor will
1564  * be positioned appropriately for an iteration but may no longer be at
1565  * a leaf node.
1566  *
1567  * This routine is only called from the backend.
1568  *
1569  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1570  * cursor and retry.
1571  */
1572 int
1573 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1574 {
1575         hammer_btree_elm_t elm;
1576         hammer_mount_t hmp;
1577         int error;
1578         int dodelete;
1579
1580         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1581
1582         /*
1583          * In-memory (unsynchronized) records can simply be freed.  This
1584          * only occurs in range iterations since all other records are
1585          * individually synchronized.  Thus there should be no confusion with
1586          * the interlock.
1587          */
1588         if (cursor->record == &cursor->iprec->rec) {
1589                 KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
1590                 cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
1591                 cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
1592                 return(0);
1593         }
1594
1595         /*
1596          * On-disk records are marked as deleted by updating their delete_tid.
1597          * This does not effect their position in the B-Tree (which is based
1598          * on their create_tid).
1599          */
1600         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1601         elm = NULL;
1602         hmp = cursor->node->hmp;
1603
1604         dodelete = 0;
1605         if (error == 0) {
1606                 error = hammer_cursor_upgrade(cursor);
1607                 if (error == 0) {
1608                         elm = &cursor->node->ondisk->elms[cursor->index];
1609                         hammer_modify_node(cursor->trans, cursor->node,
1610                                            &elm->leaf.base.delete_tid,
1611                                            sizeof(elm->leaf.base.delete_tid));
1612                         elm->leaf.base.delete_tid = tid;
1613                         hammer_modify_node_done(cursor->node);
1614
1615                         /*
1616                          * An on-disk record cannot have the same delete_tid
1617                          * as its create_tid.  In a chain of record updates
1618                          * this could result in a duplicate record.
1619                          */
1620                         KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
1621                         hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
1622                         cursor->record->base.base.delete_tid = tid;
1623                         hammer_modify_buffer_done(cursor->record_buffer);
1624                 }
1625         }
1626
1627         /*
1628          * If we were mounted with the nohistory option, we physically
1629          * delete the record.
1630          */
1631         if (hmp->hflags & HMNT_NOHISTORY)
1632                 dodelete = 1;
1633
1634         if (error == 0 && dodelete) {
1635                 error = hammer_delete_at_cursor(cursor, NULL);
1636                 if (error) {
1637                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1638                         error = 0;
1639                 }
1640         }
1641         return(error);
1642 }
1643
1644 int
1645 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1646 {
1647         hammer_btree_elm_t elm;
1648         hammer_off_t rec_offset;
1649         hammer_off_t data_offset;
1650         int32_t data_len;
1651         u_int16_t rec_type;
1652         int error;
1653
1654         elm = &cursor->node->ondisk->elms[cursor->index];
1655         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1656
1657         rec_offset = elm->leaf.rec_offset;
1658         data_offset = elm->leaf.data_offset;
1659         data_len = elm->leaf.data_len;
1660         rec_type = elm->leaf.base.rec_type;
1661
1662         error = hammer_btree_delete(cursor);
1663         if (error == 0) {
1664                 /*
1665                  * This forces a fixup for the iteration because
1666                  * the cursor is now either sitting at the 'next'
1667                  * element or sitting at the end of a leaf.
1668                  */
1669                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1670                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1671                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1672                 }
1673         }
1674         if (error == 0) {
1675                 hammer_blockmap_free(cursor->trans, rec_offset,
1676                                      sizeof(union hammer_record_ondisk));
1677         }
1678         if (error == 0) {
1679                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1680                 case HAMMER_ZONE_LARGE_DATA:
1681                 case HAMMER_ZONE_SMALL_DATA:
1682                         hammer_blockmap_free(cursor->trans,
1683                                              data_offset, data_len);
1684                         break;
1685                 default:
1686                         break;
1687                 }
1688         }
1689 #if 0
1690         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1691                 "(%d remain in cluster)\n",
1692                 cluster->volume->vol_no, cluster->clu_no,
1693                 rec_offset, data_offset, data_len,
1694                 cluster->ondisk->stat_records);
1695 #endif
1696         return (error);
1697 }
1698
1699 /*
1700  * Determine whether we can remove a directory.  This routine checks whether
1701  * a directory is empty or not and enforces flush connectivity.
1702  *
1703  * Flush connectivity requires that we block if the target directory is
1704  * currently flushing, otherwise it may not end up in the same flush group.
1705  *
1706  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
1707  */
1708 int
1709 hammer_ip_check_directory_empty(hammer_transaction_t trans,
1710                         hammer_cursor_t parent_cursor, hammer_inode_t ip)
1711 {
1712         struct hammer_cursor cursor;
1713         int error;
1714
1715 #if 0
1716         /*
1717          * Check flush connectivity
1718          */
1719         if (ip->flush_state != HAMMER_FST_IDLE) {
1720                 kprintf("FWAIT\n");
1721                 hammer_done_cursor(parent_cursor);
1722                 hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
1723                 hammer_wait_inode(ip);
1724                 return (EDEADLK);
1725         }
1726 #endif
1727
1728         /*
1729          * Check directory empty
1730          */
1731         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1732
1733         cursor.key_beg.obj_id = ip->obj_id;
1734         cursor.key_beg.create_tid = 0;
1735         cursor.key_beg.delete_tid = 0;
1736         cursor.key_beg.obj_type = 0;
1737         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1738         cursor.key_beg.key = HAMMER_MIN_KEY;
1739
1740         cursor.key_end = cursor.key_beg;
1741         cursor.key_end.rec_type = 0xFFFF;
1742         cursor.key_end.key = HAMMER_MAX_KEY;
1743
1744         cursor.asof = ip->obj_asof;
1745         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1746
1747         error = hammer_ip_first(&cursor, ip);
1748         if (error == ENOENT)
1749                 error = 0;
1750         else if (error == 0)
1751                 error = ENOTEMPTY;
1752         hammer_done_cursor(&cursor);
1753         return(error);
1754 }
1755