hammer2 - Use B_IOISSUED
sys/vfs/hammer2/hammer2_io.c
/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used for OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * that use smaller allocations, without causing deadlocks.
 */
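
/*
 * Typical synchronous use of this layer (a minimal sketch; error
 * handling and the caller's chain locking are omitted, and btype,
 * lbase and lsize stand in for the caller's values):
 *
 *      hammer2_io_t *dio;
 *      char *data;
 *      int error;
 *
 *      error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 *      if (error == 0) {
 *              data = hammer2_io_data(dio, lbase);
 *              ... use lsize bytes at data ...
 *              hammer2_io_bqrelse(&dio);
 *      }
 */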
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
        if (io1->pbase < io2->pbase)
                return(-1);
        if (io1->pbase > io2->pbase)
                return(1);
        return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
                off_t, pbase);

struct hammer2_cleanupcb_info {
        struct hammer2_io_tree tmptree;
        int     count;
};

static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
        uint64_t mask;
        int i;

        if (bytes < 1024)       /* smaller chunks not supported */
                return 0;

        /*
         * Calculate crc check mask for larger chunks
         */
        i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
             HAMMER2_PBUFMASK) >> 10;
        if (i == 0 && bytes == HAMMER2_PBUFSIZE)
                return((uint64_t)-1);
        mask = ((uint64_t)1U << (bytes >> 10)) - 1;
        mask <<= i;

        return mask;
}
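
/*
 * Worked example (illustrative, assuming a 64KB HAMMER2_PBUFSIZE): each
 * mask bit covers 1KB of the physical buffer.  A 16KB chunk starting
 * 16KB into the buffer gives i = 16384 >> 10 = 16 and
 * mask = ((1 << 16) - 1) << 16 = 0x00000000ffff0000, while a full 64KB
 * chunk at offset 0 short-cuts to (uint64_t)-1.
 */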

#define HAMMER2_GETBLK_GOOD     0
#define HAMMER2_GETBLK_QUEUED   1
#define HAMMER2_GETBLK_OWNED    2

/*
 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
 */
void
hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
                  hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio;
        hammer2_io_t *xio;
        off_t pbase;
        off_t pmask;
        /*
         * XXX after free, buffer reuse case w/ different size can clash
         * with dio cache.  Let's avoid it for now.  Ultimately we need
         * to invalidate the dio cache when freeing blocks to allow a
         * mix of 16KB and 64KB block sizes.
         */
        /*int psize = hammer2_devblksize(lsize);*/
        int psize = HAMMER2_PBUFSIZE;
        int refs;

        pmask = ~(hammer2_off_t)(psize - 1);

        KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
        lbase &= ~HAMMER2_OFF_MASK_RADIX;
        pbase = lbase & pmask;
        KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

        /*
         * Access/Allocate the DIO, bump dio->refs to prevent destruction.
         */
        hammer2_spin_sh(&hmp->io_spin);
        dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
        if (dio) {
                if ((atomic_fetchadd_int(&dio->refs, 1) &
                     HAMMER2_DIO_MASK) == 0) {
                        atomic_add_int(&dio->hmp->iofree_count, -1);
                }
                hammer2_spin_unsh(&hmp->io_spin);
        } else {
                hammer2_spin_unsh(&hmp->io_spin);
                dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
                dio->hmp = hmp;
                dio->pbase = pbase;
                dio->psize = psize;
                dio->btype = iocb->btype;
                dio->refs = 1;
                hammer2_spin_init(&dio->spin, "h2dio");
                TAILQ_INIT(&dio->iocbq);
                hammer2_spin_ex(&hmp->io_spin);
                xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
                if (xio == NULL) {
                        atomic_add_int(&hammer2_dio_count, 1);
                        hammer2_spin_unex(&hmp->io_spin);
                } else {
                        if ((atomic_fetchadd_int(&xio->refs, 1) &
                             HAMMER2_DIO_MASK) == 0) {
                                atomic_add_int(&xio->hmp->iofree_count, -1);
                        }
                        hammer2_spin_unex(&hmp->io_spin);
                        kfree(dio, M_HAMMER2);
                        dio = xio;
                }
        }

        /*
         * Obtain/Validate the buffer.
         */
        iocb->dio = dio;

        if (dio->act < 5)       /* SMP race ok */
                ++dio->act;

        for (;;) {
                refs = dio->refs;
                cpu_ccfence();

                /*
                 * Issue the iocb immediately if the buffer is already good.
                 * Once set, GOOD cannot be cleared until refs drops to 0.
                 *
                 * lfence required because dio's are not interlocked for
                 * the DIO_GOOD test.
                 */
                if (refs & HAMMER2_DIO_GOOD) {
                        cpu_lfence();
                        iocb->callback(iocb);
                        break;
                }

                /*
                 * Try to own the DIO by setting INPROG so we can issue
                 * I/O on it.
                 */
                if (refs & HAMMER2_DIO_INPROG) {
                        /*
                         * If DIO_INPROG is already set then set WAITING and
                         * queue the iocb.
                         */
                        hammer2_spin_ex(&dio->spin);
                        if (atomic_cmpset_int(&dio->refs, refs,
                                              refs | HAMMER2_DIO_WAITING)) {
                                iocb->flags |= HAMMER2_IOCB_ONQ |
                                               HAMMER2_IOCB_INPROG;
                                TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
                                hammer2_spin_unex(&dio->spin);
                                break;
                        }
                        hammer2_spin_unex(&dio->spin);
                        /* retry */
                } else {
                        /*
                         * If DIO_INPROG is not set then set it and issue the
                         * callback immediately to start I/O.
                         */
                        if (atomic_cmpset_int(&dio->refs, refs,
                                              refs | HAMMER2_DIO_INPROG)) {
                                iocb->flags |= HAMMER2_IOCB_INPROG;
                                iocb->callback(iocb);
                                break;
                        }
                        /* retry */
                }
                /* retry */
        }
}
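
/*
 * Caller sketch (mirrors hammer2_io_bread() below): fill in an iocb,
 * issue it, and wait for completion if it could not be satisfied
 * synchronously.  my_callback is a hypothetical iocb callback.
 *
 *      hammer2_iocb_t iocb;
 *
 *      iocb.callback = my_callback;
 *      iocb.cluster = NULL;
 *      iocb.chain = NULL;
 *      iocb.ptr = NULL;
 *      iocb.lbase = lbase;
 *      iocb.lsize = lsize;
 *      iocb.btype = btype;
 *      iocb.flags = 0;
 *      iocb.error = 0;
 *      hammer2_io_getblk(hmp, lbase, lsize, &iocb);
 *      if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
 *              hammer2_iocb_wait(&iocb);
 */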

/*
 * Quickly obtain a good DIO buffer, return NULL if the system no longer
 * caches the data.
 */
hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
        hammer2_iocb_t iocb;
        hammer2_io_t *dio;
        struct buf *bp;
        off_t pbase;
        off_t pmask;
        int psize = HAMMER2_PBUFSIZE;
        int orefs;
        int nrefs;

        pmask = ~(hammer2_off_t)(psize - 1);

        KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
        lbase &= ~HAMMER2_OFF_MASK_RADIX;
        pbase = lbase & pmask;
        KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

        /*
         * Access/Allocate the DIO, bump dio->refs to prevent destruction.
         */
        hammer2_spin_sh(&hmp->io_spin);
        dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
        if (dio == NULL) {
                hammer2_spin_unsh(&hmp->io_spin);
                return NULL;
        }

        if ((atomic_fetchadd_int(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
                atomic_add_int(&dio->hmp->iofree_count, -1);
        hammer2_spin_unsh(&hmp->io_spin);

        if (dio->act < 5)       /* SMP race ok */
                ++dio->act;

        /*
         * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
         * the system does not have the data already cached.
         */
        nrefs = -1;
        for (;;) {
                orefs = dio->refs;
                cpu_ccfence();

                /*
                 * Return the dio immediately if the buffer is already good.
                 * Once set, GOOD cannot be cleared until refs drops to 0.
                 *
                 * lfence required because dio is not interlocked for
                 * the DIO_GOOD test.
                 */
                if (orefs & HAMMER2_DIO_GOOD) {
                        cpu_lfence();
                        break;
                }

                /*
                 * Try to own the DIO by setting INPROG so we can issue
                 * I/O on it.  INPROG might already be set, in which case
                 * there is no way we can do this non-blocking so we punt.
                 */
                if ((orefs & HAMMER2_DIO_INPROG))
                        break;
                nrefs = orefs | HAMMER2_DIO_INPROG;
                if (atomic_cmpset_int(&dio->refs, orefs, nrefs) == 0)
                        continue;

                /*
                 * We own DIO_INPROG, try to set DIO_GOOD.
                 *
                 * For now do not use GETBLK_NOWAIT because
                 */
                bp = dio->bp;
                dio->bp = NULL;
                if (bp == NULL) {
#if 0
                        bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
#endif
                        bread(hmp->devvp, dio->pbase, dio->psize, &bp);
                }
                if (bp) {
                        if ((bp->b_flags & B_ERROR) == 0 &&
                            (bp->b_flags & B_CACHE)) {
                                dio->bp = bp;   /* assign BEFORE setting flag */
                                atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
                        } else {
                                bqrelse(bp);
                                bp = NULL;
                        }
                }

                /*
                 * Clear DIO_INPROG.
                 *
                 * This is actually a bit complicated, see
                 * hammer2_io_complete() for more information.
                 */
                iocb.dio = dio;
                iocb.flags = HAMMER2_IOCB_INPROG;
                hammer2_io_complete(&iocb);
                break;
        }

        /*
         * Only return the dio if its buffer is good.
         */
        if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
                hammer2_io_putblk(&dio);
        }
        return dio;
}
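
/*
 * Sketch of an opportunistic read that falls back to a blocking bread
 * when the data is no longer cached (a minimal sketch; btype, lbase and
 * lsize stand in for the caller's values):
 *
 *      dio = hammer2_io_getquick(hmp, lbase, lsize);
 *      if (dio == NULL)
 *              error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 */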

/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        hammer2_iocb_t *cbtmp;
        uint32_t orefs;
        uint32_t nrefs;
        uint32_t oflags;
        uint32_t nflags;

        /*
         * If IOCB_INPROG was not set, completion is synchronous due to the
         * buffer already being good.  We can simply set IOCB_DONE and
         * return.  In this situation DIO_INPROG is not set and we have no
         * visibility on dio->bp.
         */
        if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
                atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
                return;
        }

        /*
         * The iocb was queued, obtained DIO_INPROG, and its callback was
         * made.  The callback is now complete.  We still own DIO_INPROG.
         *
         * We can set DIO_GOOD if no error occurred, which gives certain
         * stability guarantees to dio->bp and allows other accessors to
         * short-cut access.  DIO_GOOD cannot be cleared until the last
         * ref is dropped.
         */
        KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
        if (dio->bp) {
                BUF_KERNPROC(dio->bp);
                if ((dio->bp->b_flags & B_ERROR) == 0) {
                        KKASSERT(dio->bp->b_flags & B_CACHE);
                        atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
                }
        }

        /*
         * Clean up the dio before marking the iocb as being done.  If
         * another iocb is pending we chain to it while leaving DIO_INPROG
         * set (it will call io completion and presumably clear DIO_INPROG).
         *
         * Otherwise if no other iocbs are pending we clear DIO_INPROG before
         * finishing up the cbio.  This means that DIO_INPROG is cleared at
         * the end of the chain before ANY of the cbios are marked done.
         *
         * NOTE: The TAILQ is not stable until the spin-lock is held.
         */
        for (;;) {
                orefs = dio->refs;
                nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

                if (orefs & HAMMER2_DIO_WAITING) {
                        hammer2_spin_ex(&dio->spin);
                        cbtmp = TAILQ_FIRST(&dio->iocbq);
                        if (cbtmp) {
                                /*
                                 * NOTE: flags not adjusted in this case.
                                 *       Flags will be adjusted by the last
                                 *       iocb.
                                 */
                                TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
                                hammer2_spin_unex(&dio->spin);
                                cbtmp->callback(cbtmp); /* chained */
                                break;
                        } else if (atomic_cmpset_int(&dio->refs,
                                                     orefs, nrefs)) {
                                hammer2_spin_unex(&dio->spin);
                                break;
                        }
                        hammer2_spin_unex(&dio->spin);
                        /* retry */
                } else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
                        break;
                }
                /* else retry */
        }

        /*
         * Mark the iocb as done and wake up any waiters.  This is done
         * after all iocb chains have been called back and after DIO_INPROG
         * has been cleared.  This avoids races against ref count drops by
         * the waiting threads (a hard but not impossible SMP race) which
         * might result in a 1->0 transition of the refs while DIO_INPROG
         * is still set.
         */
        for (;;) {
                oflags = iocb->flags;
                cpu_ccfence();
                nflags = oflags;
                nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
                nflags |= HAMMER2_IOCB_DONE;

                if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
                        if (oflags & HAMMER2_IOCB_WAKEUP)
                                wakeup(iocb);
                        /* SMP: iocb is now stale */
                        break;
                }
                /* retry */
        }
        iocb = NULL;
}

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
        uint32_t oflags;
        uint32_t nflags;

        for (;;) {
                oflags = iocb->flags;
                cpu_ccfence();
                nflags = oflags | HAMMER2_IOCB_WAKEUP;
                if (oflags & HAMMER2_IOCB_DONE)
                        break;
                tsleep_interlock(iocb, 0);
                if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
                        tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
                }
        }
}
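
/*
 * Note on the loop above: tsleep_interlock() is taken before the cmpset
 * publishes IOCB_WAKEUP, so a wakeup() issued by hammer2_io_complete()
 * between the DONE test and the tsleep() is not lost; PINTERLOCKED ties
 * the sleep to the earlier interlock and the hz timeout is only a
 * backstop.  If complete() sets DONE first, the cmpset fails and the
 * next iteration observes DONE and breaks out.
 */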

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
        hammer2_dev_t *hmp;
        hammer2_io_t *dio;
        hammer2_iocb_t iocb;
        struct buf *bp;
        off_t peof;
        off_t pbase;
        int psize;
        int orefs;
        int nrefs;

        dio = *diop;
        *diop = NULL;
        hmp = dio->hmp;

        /*
         * Drop refs.
         *
         * On the 1->0 transition clear flags and set INPROG.
         *
         * On the 1->0 transition if INPROG is already set, another thread
         * is in lastdrop and we can just return after the transition.
         *
         * On any other transition we can generally just return.
         */
        for (;;) {
                orefs = dio->refs;
                cpu_ccfence();
                nrefs = orefs - 1;

                if ((orefs & HAMMER2_DIO_MASK) == 1 &&
                    (orefs & HAMMER2_DIO_INPROG) == 0) {
                        /*
                         * Lastdrop case, INPROG can be set.
                         */
                        nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
                        nrefs |= HAMMER2_DIO_INPROG;
                        if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
                                break;
                } else if ((orefs & HAMMER2_DIO_MASK) == 1) {
                        /*
                         * Lastdrop case, INPROG already set.
                         */
                        if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
                                atomic_add_int(&hmp->iofree_count, 1);
                                return;
                        }
                } else {
                        /*
                         * Normal drop case.
                         */
                        if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
                                return;
                }
                cpu_pause();
                /* retry */
        }

        /*
         * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
         * have been cleared.
         *
         * We can now dispose of the buffer, and should do it before calling
         * io_complete() in case there's a race against a new reference
         * which causes io_complete() to chain and instantiate the bp again.
         */
        pbase = dio->pbase;
        psize = dio->psize;
        bp = dio->bp;
        dio->bp = NULL;

        if (orefs & HAMMER2_DIO_GOOD) {
                KKASSERT(bp != NULL);
                if (orefs & HAMMER2_DIO_DIRTY) {
                        int hce;

                        dio_write_stats_update(dio);
                        if ((hce = hammer2_cluster_enable) > 0) {
                                peof = (pbase + HAMMER2_SEGMASK64) &
                                       ~HAMMER2_SEGMASK64;
                                cluster_write(bp, peof, psize, hce);
                        } else {
                                bp->b_flags |= B_CLUSTEROK;
                                bdwrite(bp);
                        }
                } else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
                        brelse(bp);
                } else {
                        bqrelse(bp);
                }
        } else if (bp) {
                if (orefs & HAMMER2_DIO_DIRTY) {
                        dio_write_stats_update(dio);
                        bdwrite(bp);
                } else {
                        brelse(bp);
                }
        }

        /*
         * The instant we call io_complete(), the dio is a free agent again
         * and can be ripped out from under us.  We can clean up our final
         * DIO_INPROG by simulating an iocb completion.
         */
        hmp = dio->hmp;                         /* extract fields */
        atomic_add_int(&hmp->iofree_count, 1);
        cpu_ccfence();

        iocb.dio = dio;
        iocb.flags = HAMMER2_IOCB_INPROG;
        hammer2_io_complete(&iocb);
        dio = NULL;                             /* dio stale */

        /*
         * We cache free buffers so re-use cases can use a shared lock, but
         * if too many build up we have to clean them out.
         */
        if (hmp->iofree_count > 65536) {
                struct hammer2_cleanupcb_info info;

                RB_INIT(&info.tmptree);
                hammer2_spin_ex(&hmp->io_spin);
                if (hmp->iofree_count > 65536) {
                        info.count = hmp->iofree_count / 4;
                        RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
                                hammer2_io_cleanup_callback, &info);
                }
                hammer2_spin_unex(&hmp->io_spin);
                hammer2_io_cleanup(hmp, &info.tmptree);
        }
}

/*
 * Cleanup any dio's with (INPROG | refs) == 0.
 *
 * Called from hammer2_io_putblk() when too many free DIOs accumulate;
 * candidates are moved to a temporary tree which hammer2_io_cleanup()
 * then reaps.  hammer2_io_cleanup() is also used on umount after all
 * activity has been flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
        struct hammer2_cleanupcb_info *info = arg;
        hammer2_io_t *xio;

        if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
                if (dio->act > 0) {
                        --dio->act;
                        return 0;
                }
                KKASSERT(dio->bp == NULL);
                RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
                xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
                KKASSERT(xio == NULL);
                if (--info->count <= 0) /* limit scan */
                        return(-1);
        }
        return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
        hammer2_io_t *dio;

        while ((dio = RB_ROOT(tree)) != NULL) {
                RB_REMOVE(hammer2_io_tree, tree, dio);
                KKASSERT(dio->bp == NULL &&
                    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
                kfree(dio, M_HAMMER2);
                atomic_add_int(&hammer2_dio_count, -1);
                atomic_add_int(&hmp->iofree_count, -1);
        }
}
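
/*
 * The tree passed in is either the info.tmptree populated by the
 * cleanup callback above, or (on umount, once all I/O has settled)
 * the device's whole iotree; a sketch of the latter:
 *
 *      hammer2_io_cleanup(hmp, &hmp->iotree);
 */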

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
        struct buf *bp;
        int off;

        bp = dio->bp;
        KKASSERT(bp != NULL);
        off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
        KKASSERT(off >= 0 && off < bp->b_bufsize);
        return(bp->b_data + off);
}
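
/*
 * e.g. with the underlying buffer loaded at b_loffset == dio->pbase, a
 * lbase pointing 16KB into the physical buffer yields off == 16384 and
 * a data pointer of bp->b_data + 16384 (illustrative values; the low
 * radix bits of lbase encode the block size and are masked off first).
 */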

#if 0
/*
 * Keep track of good CRCs in dio->good_crc_mask. XXX needs to be done
 * in the chain structure, but the chain structure needs to be persistent
 * as well on refs=0 and it isn't.
 */
int
hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
{
        hammer2_io_t *dio;
        uint64_t mask;

        if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
                mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
                *maskp = mask;
                if ((dio->crc_good_mask & mask) == mask)
                        return 1;
                return 0;
        }
        *maskp = 0;

        return 0;
}

void
hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
{
        if (dio) {
                if (sizeof(long) == 8) {
                        atomic_set_long(&dio->crc_good_mask, mask);
                } else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
                        atomic_set_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)mask);
                        atomic_set_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)(mask >> 32));
#else
                        atomic_set_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)(mask >> 32));
                        atomic_set_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)mask);
#endif
                }
        }
}

void
hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
{
        if (dio) {
                if (sizeof(long) == 8) {
                        atomic_clear_long(&dio->crc_good_mask, mask);
                } else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)mask);
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)(mask >> 32));
#else
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)(mask >> 32));
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)mask);
#endif
                }
        }
}
#endif

/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

        /*
         * If IOCB_INPROG is not set the dio already has a good buffer and
         * we can't mess with it other than to zero the requested range.
         *
         * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
         * do what needs to be done with dio->bp.
         */
        if (iocb->flags & HAMMER2_IOCB_INPROG) {
                if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
                        if (iocb->lsize == dio->psize) {
                                /*
                                 * Fully covered buffer, try to optimize to
                                 * avoid any I/O.  We might already have the
                                 * buffer due to iocb chaining.
                                 */
                                if (dio->bp == NULL) {
                                        dio->bp = getblk(dio->hmp->devvp,
                                                         dio->pbase, dio->psize,
                                                         gbctl, 0);
                                }
                                if (dio->bp) {
                                        vfs_bio_clrbuf(dio->bp);
                                        dio->bp->b_flags |= B_CACHE;
                                }
                        } else if (iocb->flags & HAMMER2_IOCB_QUICK) {
                                /*
                                 * Partial buffer, quick mode.  Do nothing.
                                 * Do not instantiate the buffer or try to
                                 * mark it B_CACHE because other portions of
                                 * the buffer might have to be read by other
                                 * accessors.
                                 */
                        } else if (dio->bp == NULL ||
                                   (dio->bp->b_flags & B_CACHE) == 0) {
                                /*
                                 * Partial buffer, normal mode, requires
                                 * read-before-write.  Chain the read.
                                 *
                                 * We might already have the buffer due to
                                 * iocb chaining.  XXX unclear if we really
                                 * need to write/release it and reacquire
                                 * in that case.
                                 *
                                 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
                                 */
                                if (dio->bp) {
                                        if (dio->refs & HAMMER2_DIO_DIRTY) {
                                                dio_write_stats_update(dio);
                                                bdwrite(dio->bp);
                                        } else {
                                                bqrelse(dio->bp);
                                        }
                                        dio->bp = NULL;
                                }
                                atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
                                breadcb(dio->hmp->devvp,
                                        dio->pbase, dio->psize,
                                        hammer2_io_callback, iocb);
                                return;
                        } /* else buffer is good */
                } /* else callback from breadcb is complete */
        }
        if (dio->bp) {
                if (iocb->flags & HAMMER2_IOCB_ZERO)
                        bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
                atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
        }
        hammer2_io_complete(iocb);
}

static
int
_hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                hammer2_io_t **diop, int flags)
{
        hammer2_iocb_t iocb;
        hammer2_io_t *dio;

        iocb.callback = hammer2_iocb_new_callback;
        iocb.cluster = NULL;
        iocb.chain = NULL;
        iocb.ptr = NULL;
        iocb.lbase = lbase;
        iocb.lsize = lsize;
        iocb.flags = flags;
        iocb.btype = btype;
        iocb.error = 0;
        hammer2_io_getblk(hmp, lbase, lsize, &iocb);
        if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
                hammer2_iocb_wait(&iocb);
        dio = *diop = iocb.dio;

        return (iocb.error);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
               hammer2_io_t **diop)
{
        return(_hammer2_io_new(hmp, btype, lbase, lsize,
                               diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                 hammer2_io_t **diop)
{
        return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
}

int
hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                hammer2_io_t **diop)
{
        return(_hammer2_io_new(hmp, btype, lbase, lsize,
                               diop, HAMMER2_IOCB_QUICK));
}
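
/*
 * The three wrappers differ only in the iocb flags: hammer2_io_new()
 * zeroes the requested range (HAMMER2_IOCB_ZERO), hammer2_io_newnz()
 * leaves it as-is, and hammer2_io_newq() runs in non-blocking quick
 * mode (HAMMER2_IOCB_QUICK).  e.g. allocating a zeroed 16KB data block
 * (illustrative values):
 *
 *      error = hammer2_io_new(hmp, HAMMER2_BREF_TYPE_DATA,
 *                             lbase, 16384, &dio);
 */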

static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        off_t peof;
        int error;

        /*
         * If IOCB_INPROG is not set the dio already has a good buffer and
         * we must leave it alone.
         *
         * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
         * do what needs to be done with dio->bp.
         */
        if (iocb->flags & HAMMER2_IOCB_INPROG) {
                int hce;

                if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
                        /*
                         * Already good, likely due to being chained from
                         * another iocb.
                         */
                        error = 0;
                } else if ((hce = hammer2_cluster_enable) > 0) {
                        /*
                         * Synchronous cluster I/O for now.
                         */
                        if (dio->bp) {
                                bqrelse(dio->bp);
                                dio->bp = NULL;
                        }
                        peof = (dio->pbase + HAMMER2_SEGMASK64) &
                               ~HAMMER2_SEGMASK64;
                        error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
                                             dio->psize,
                                             dio->psize, HAMMER2_PBUFSIZE*hce,
                                             &dio->bp);
                } else {
                        /*
                         * Synchronous I/O for now.
                         */
                        if (dio->bp) {
                                bqrelse(dio->bp);
                                dio->bp = NULL;
                        }
                        error = bread(dio->hmp->devvp, dio->pbase,
                                      dio->psize, &dio->bp);
                }
                if (error) {
                        brelse(dio->bp);
                        dio->bp = NULL;
                }
        }
        hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                hammer2_io_t **diop)
{
        hammer2_iocb_t iocb;
        hammer2_io_t *dio;

        iocb.callback = hammer2_iocb_bread_callback;
        iocb.cluster = NULL;
        iocb.chain = NULL;
        iocb.ptr = NULL;
        iocb.lbase = lbase;
        iocb.lsize = lsize;
        iocb.btype = btype;
        iocb.flags = 0;
        iocb.error = 0;
        hammer2_io_getblk(hmp, lbase, lsize, &iocb);
        if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
                hammer2_iocb_wait(&iocb);
        dio = *diop = iocb.dio;

        return (iocb.error);
}

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
        struct buf *dbp = bio->bio_buf;
        hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
        hammer2_io_t *dio;

        dio = iocb->dio;
        if ((bio->bio_flags & BIO_DONE) == 0)
                bpdone(dbp, 0);
        bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
        dio->bp = bio->bio_buf;
        iocb->callback(iocb);
}

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
        atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
        atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
        atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
        return (0);     /* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
        atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
}

void
hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
#if 0
        uint64_t mask = hammer2_io_mask(dio, off, bytes);
        hammer2_io_crc_clrmask(dio, mask);
#endif
        if ((u_int)dio->psize == bytes) {
                dio->bp->b_flags |= B_INVAL | B_RELBUF;
                /* dio->bp->b_flags &= ~B_CACHE; not needed */
        }
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
        hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
        hammer2_io_putblk(diop);
}
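
/*
 * Note: hammer2_io_brelse() and hammer2_io_bqrelse() are identical
 * wrappers; the actual brelse-vs-bqrelse decision is deferred to the
 * lastdrop path in hammer2_io_putblk(), which inspects DIO_GOOD and
 * DIO_DIRTY (and the buffer's B_ERROR/B_INVAL/B_RELBUF flags) to pick
 * the disposition.
 */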

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
        return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}

static
void
dio_write_stats_update(hammer2_io_t *dio)
{
        long *counterp;

        switch(dio->btype) {
        case 0:
                return;
        case HAMMER2_BREF_TYPE_DATA:
                counterp = &hammer2_iod_file_write;
                break;
        case HAMMER2_BREF_TYPE_INODE:
                counterp = &hammer2_iod_meta_write;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                counterp = &hammer2_iod_indr_write;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                counterp = &hammer2_iod_fmap_write;
                break;
        default:
                counterp = &hammer2_iod_volu_write;
                break;
        }
        *counterp += dio->psize;
}