/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used as an OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * using smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
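
/*
 * Illustrative synchronous read path (a sketch, not code from this
 * file; see hammer2_io_bread() and hammer2_io_data() below):
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *
 *	if (hammer2_io_bread(hmp, btype, lbase, lsize, &dio) == 0) {
 *		data = hammer2_io_data(dio, lbase);
 *		...use up to lsize bytes at data...
 *		hammer2_io_bqrelse(&dio);	releases our ref, keeps
 *						the buffer cached
 *	}
 */
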
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
        if (io1->pbase < io2->pbase)
                return(-1);
        if (io1->pbase > io2->pbase)
                return(1);
        return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
                off_t, pbase);

struct hammer2_cleanupcb_info {
        struct hammer2_io_tree tmptree;
        int     count;
};

static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
        uint64_t mask;
        int i;

        if (bytes < 1024)       /* smaller chunks not supported */
                return 0;

        /*
         * Calculate crc check mask for larger chunks
         */
        i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
             HAMMER2_PBUFMASK) >> 10;
        if (i == 0 && bytes == HAMMER2_PBUFSIZE)
                return((uint64_t)-1);
        mask = ((uint64_t)1U << (bytes >> 10)) - 1;
        mask <<= i;

        return mask;
}
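
/*
 * Worked example of the mask layout (one bit per 1KB of the 64KB
 * device buffer): a 16KB chunk starting 32KB into the buffer gives
 * i = 32 and mask = ((1 << 16) - 1) << 32 = 0x0000ffff00000000.
 * A full 64KB chunk at offset 0 short-cuts to (uint64_t)-1.
 */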

#define HAMMER2_GETBLK_GOOD     0
#define HAMMER2_GETBLK_QUEUED   1
#define HAMMER2_GETBLK_OWNED    2

/*
 * Returns the DIO corresponding to the data|radix, creating it if necessary.
 *
 * If createit is 0, NULL can be returned indicating that the DIO does not
 * exist.  (btype) is ignored when createit is 0.
 */
static __inline
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
                 int createit)
{
        hammer2_io_t *dio;
        hammer2_io_t *xio;
        hammer2_key_t lbase;
        hammer2_key_t pbase;
        hammer2_key_t pmask;
        int lsize;
        int psize;

        psize = HAMMER2_PBUFSIZE;
        pmask = ~(hammer2_off_t)(psize - 1);
        lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
        lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
        pbase = lbase & pmask;

        if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
                kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
                        pbase, lbase, lsize, pmask);
        }
        KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

        /*
         * Access/Allocate the DIO, bump dio->refs to prevent destruction.
         */
        hammer2_spin_sh(&hmp->io_spin);
        dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
        if (dio) {
                if ((atomic_fetchadd_64(&dio->refs, 1) &
                     HAMMER2_DIO_MASK) == 0) {
                        atomic_add_int(&dio->hmp->iofree_count, -1);
                }
                hammer2_spin_unsh(&hmp->io_spin);
        } else if (createit) {
                hammer2_spin_unsh(&hmp->io_spin);
                dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
                dio->hmp = hmp;
                dio->pbase = pbase;
                dio->psize = psize;
                dio->btype = btype;
                dio->refs = 1;
                dio->act = 5;
                hammer2_spin_init(&dio->spin, "h2dio");
                TAILQ_INIT(&dio->iocbq);
                hammer2_spin_ex(&hmp->io_spin);
                xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
                if (xio == NULL) {
                        atomic_add_int(&hammer2_dio_count, 1);
                        hammer2_spin_unex(&hmp->io_spin);
                } else {
                        if ((atomic_fetchadd_64(&xio->refs, 1) &
                             HAMMER2_DIO_MASK) == 0) {
                                atomic_add_int(&xio->hmp->iofree_count, -1);
                        }
                        hammer2_spin_unex(&hmp->io_spin);
                        kfree(dio, M_HAMMER2);
                        dio = xio;
                }
        } else {
                hammer2_spin_unsh(&hmp->io_spin);
                return NULL;
        }
        dio->ticks = ticks;
        if (dio->act < 10)
                ++dio->act;

        return dio;
}
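
/*
 * Note on the dio->refs encoding (see the HAMMER2_DIO_* definitions in
 * hammer2.h): the low bits (HAMMER2_DIO_MASK) hold the reference count
 * while the high bits hold the GOOD/INPROG/WAITING/DIRTY state flags,
 * which is what allows the count and state to be manipulated together
 * with 64-bit atomic ops throughout this file.
 */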

/*
 * Allocate/Locate the requested dio, reference it, issue or queue the iocb.
 */
void
hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
                  hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio;
        uint64_t refs;

        KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
        dio = hammer2_io_alloc(hmp, lbase, iocb->btype, 1);

        iocb->dio = dio;

        for (;;) {
                refs = dio->refs;
                cpu_ccfence();

                /*
                 * Issue the iocb immediately if the buffer is already good.
                 * Once set, GOOD cannot be cleared until refs drops to 0.
                 *
                 * lfence required because dio's are not interlocked for
                 * the DIO_GOOD test.
                 */
                if (refs & HAMMER2_DIO_GOOD) {
                        cpu_lfence();
                        iocb->callback(iocb);
                        break;
                }

                /*
                 * Try to own the DIO by setting INPROG so we can issue
                 * I/O on it.
                 */
                if (refs & HAMMER2_DIO_INPROG) {
                        /*
                         * If DIO_INPROG is already set then set WAITING and
                         * queue the iocb.
                         */
                        hammer2_spin_ex(&dio->spin);
                        if (atomic_cmpset_64(&dio->refs, refs,
                                              refs | HAMMER2_DIO_WAITING)) {
                                iocb->flags |= HAMMER2_IOCB_ONQ |
                                               HAMMER2_IOCB_INPROG;
                                TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
                                hammer2_spin_unex(&dio->spin);
                                break;
                        }
                        hammer2_spin_unex(&dio->spin);
                        /* retry */
                } else {
                        /*
                         * If DIO_INPROG is not set then set it and issue the
                         * callback immediately to start I/O.
                         */
                        if (atomic_cmpset_64(&dio->refs, refs,
                                              refs | HAMMER2_DIO_INPROG)) {
                                iocb->flags |= HAMMER2_IOCB_INPROG;
                                iocb->callback(iocb);
                                break;
                        }
                        /* retry */
                }
                /* retry */
        }
}
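
/*
 * A typical synchronous caller (see _hammer2_io_new() and
 * hammer2_io_bread() below) fills in an iocb with a callback, calls
 * hammer2_io_getblk(), and then blocks in hammer2_iocb_wait() until
 * the callback chain marks the iocb HAMMER2_IOCB_DONE.
 */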

/*
 * Quickly obtain a good DIO buffer, return NULL if the system no longer
 * caches the data.
 */
hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, int notgood)
{
        hammer2_iocb_t iocb;
        hammer2_io_t *dio;
        struct buf *bp;
        off_t pbase;
        off_t pmask;
        int psize = HAMMER2_PBUFSIZE;
        uint64_t orefs;
        uint64_t nrefs;

        pmask = ~(hammer2_off_t)(psize - 1);

        KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
        lbase &= ~HAMMER2_OFF_MASK_RADIX;
        pbase = lbase & pmask;
        if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
                kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
                        pbase, lbase, lsize, pmask);
        }
        KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

        /*
         * Access/Allocate the DIO, bump dio->refs to prevent destruction.
         */
        hammer2_spin_sh(&hmp->io_spin);
        dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
        if (dio == NULL) {
                hammer2_spin_unsh(&hmp->io_spin);
                return NULL;
        }

        if ((atomic_fetchadd_64(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
                atomic_add_int(&dio->hmp->iofree_count, -1);
        hammer2_spin_unsh(&hmp->io_spin);

        dio->ticks = ticks;
        if (dio->act < 10)
                ++dio->act;             /* SMP race ok */

        /*
         * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
         * the system does not have the data already cached.
         */
        nrefs = (uint64_t)-1;
        for (;;) {
                orefs = dio->refs;
                cpu_ccfence();

                /*
                 * Use the buffer immediately if it is already good.
                 * Once set, GOOD cannot be cleared until refs drops to 0.
                 *
                 * lfence required because dio is not interlocked for
                 * the DIO_GOOD test.
                 */
                if (orefs & HAMMER2_DIO_GOOD) {
                        cpu_lfence();
                        break;
                }

                /*
                 * Try to own the DIO by setting INPROG so we can issue
                 * I/O on it.  INPROG might already be set, in which case
                 * there is no way we can do this non-blocking so we punt.
                 */
                if ((orefs & HAMMER2_DIO_INPROG))
                        break;
                nrefs = orefs | HAMMER2_DIO_INPROG;
                if (atomic_cmpset_64(&dio->refs, orefs, nrefs) == 0)
                        continue;

                /*
                 * We own DIO_INPROG, try to set DIO_GOOD.
                 *
                 * If (notgood) is specified the caller just wants the dio
                 * and doesn't care much about the buffer.  However, if the
                 * buffer is good (or dirty), we still want to return it.
                 *
                 * Otherwise we are trying to resolve a dedup and bread()
                 * is expected to always be better than building a new buffer
                 * that will be written.  Use bread() for better determinism
                 * than getblk().
                 */
                bp = dio->bp;
                dio->bp = NULL;
                if (bp == NULL) {
                        if (notgood)
                                bp = getblk(hmp->devvp, dio->pbase,
                                            dio->psize, 0, 0);
                        else
                                bread(hmp->devvp, dio->pbase, dio->psize, &bp);
                }

                /*
                 * System buffer must also have remained cached.
                 */
                if (bp) {
                        if ((bp->b_flags & B_ERROR) == 0 &&
                            (bp->b_flags & B_CACHE)) {
                                dio->bp = bp;   /* assign BEFORE setting flag */
                                atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
                        } else {
                                bqrelse(bp);
                                bp = NULL;
                        }
                }

                /*
                 * Clear DIO_INPROG.
                 *
                 * This is actually a bit complicated, see
                 * hammer2_io_complete() for more information.
                 */
                iocb.dio = dio;
                iocb.flags = HAMMER2_IOCB_INPROG;
                hammer2_io_complete(&iocb);
                break;
        }

        /*
         * Only return the dio if its buffer is good.  If notgood != 0,
         * we return the buffer regardless (so ephemeral dedup bits can be
         * cleared).
         */
        if (notgood == 0 && (dio->refs & HAMMER2_DIO_GOOD) == 0) {
                hammer2_io_putblk(&dio);
        }
        return dio;
}

/*
 * The originator of the iocb is finished with it.
 *
 * WARNING: iocb may be partially initialized with only iocb->dio and
 *          iocb->flags.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        hammer2_iocb_t *cbtmp;
        uint64_t orefs;
        uint64_t nrefs;
        uint32_t oflags;
        uint32_t nflags;

        /*
         * If IOCB_INPROG was not set, completion is synchronous due to the
         * buffer already being good.  We can simply set IOCB_DONE and return.
         *
         * In this situation DIO_INPROG is not set and we have no visibility
         * on dio->bp.  We should not try to mess with dio->bp because another
         * thread may be finishing up its processing.  dio->bp should already
         * be set to BUF_KERNPROC()!
         */
        if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
                atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
                return;
        }

        /*
         * The iocb was queued, obtained DIO_INPROG, and its callback was
         * made.  The callback is now complete.  We still own DIO_INPROG.
         *
         * We can set DIO_GOOD if no error occurred, which gives certain
         * stability guarantees to dio->bp and allows other accessors to
         * short-cut access.  DIO_GOOD cannot be cleared until the last
         * ref is dropped.
         */
        KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
        if (dio->bp) {
                BUF_KERNPROC(dio->bp);
                if ((dio->bp->b_flags & B_ERROR) == 0) {
                        KKASSERT(dio->bp->b_flags & B_CACHE);
                        atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
                }
        }

        /*
         * Clean up the dio before marking the iocb as being done.  If another
         * iocb is pending we chain to it while leaving DIO_INPROG set (it
         * will call io completion and presumably clear DIO_INPROG).
         *
         * Otherwise if no other iocbs are pending we clear DIO_INPROG before
         * finishing up the iocb.  This means that DIO_INPROG is cleared at
         * the end of the chain before ANY of the iocbs are marked done.
         *
         * NOTE: The TAILQ is not stable until the spin-lock is held.
         */
        for (;;) {
                orefs = dio->refs;
                nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

                if (orefs & HAMMER2_DIO_WAITING) {
                        hammer2_spin_ex(&dio->spin);
                        cbtmp = TAILQ_FIRST(&dio->iocbq);
                        if (cbtmp) {
                                /*
                                 * NOTE: flags not adjusted in this case.
                                 *       Flags will be adjusted by the last
                                 *       iocb.
                                 */
                                TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
                                hammer2_spin_unex(&dio->spin);
                                cbtmp->callback(cbtmp); /* chained */
                                break;
                        } else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
                                hammer2_spin_unex(&dio->spin);
                                break;
                        }
                        hammer2_spin_unex(&dio->spin);
                        /* retry */
                } else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
                        break;
                } /* else retry */
                /* retry */
        }

        /*
         * Mark the iocb as done and wakeup any waiters.  This is done after
         * all iocb chains have been called back and after DIO_INPROG has been
         * cleared.  This avoids races against ref count drops by the waiting
         * threads (a hard but not impossible SMP race) which might result in
         * a 1->0 transition of the refs while DIO_INPROG is still set.
         */
        for (;;) {
                oflags = iocb->flags;
                cpu_ccfence();
                nflags = oflags;
                nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
                nflags |= HAMMER2_IOCB_DONE;

                if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
                        if (oflags & HAMMER2_IOCB_WAKEUP)
                                wakeup(iocb);
                        /* SMP: iocb is now stale */
                        break;
                }
                /* retry */
        }
        iocb = NULL;
}
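
/*
 * Summary of the completion protocol above: DIO_GOOD is set (on I/O
 * success) while DIO_INPROG is still held, queued iocbs are chained
 * with DIO_INPROG left set, and the deepest iocb in the chain clears
 * DIO_INPROG before any IOCB_DONE wakeups occur.
 */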

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
        uint32_t oflags;
        uint32_t nflags;

        for (;;) {
                oflags = iocb->flags;
                cpu_ccfence();
                nflags = oflags | HAMMER2_IOCB_WAKEUP;
                if (oflags & HAMMER2_IOCB_DONE)
                        break;
                tsleep_interlock(iocb, 0);
                if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
                        tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
                }
        }
}

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
        hammer2_dev_t *hmp;
        hammer2_io_t *dio;
        hammer2_iocb_t iocb;
        struct buf *bp;
        off_t peof;
        off_t pbase;
        int psize;
        int limit_dio;
        uint64_t orefs;
        uint64_t nrefs;

        dio = *diop;
        *diop = NULL;
        hmp = dio->hmp;

        KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

        /*
         * Drop refs.
         *
         * On the 1->0 transition clear flags and set INPROG.
         *
         * On the 1->0 transition if INPROG is already set, another thread
         * is in lastdrop and we can just return after the transition.
         *
         * On any other transition we can generally just return.
         */
        for (;;) {
                orefs = dio->refs;
                cpu_ccfence();
                nrefs = orefs - 1;

                if ((orefs & HAMMER2_DIO_MASK) == 1 &&
                    (orefs & HAMMER2_DIO_INPROG) == 0) {
                        /*
                         * Lastdrop case, INPROG can be set.
                         */
                        nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
                        nrefs |= HAMMER2_DIO_INPROG;
                        if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
                                break;
                } else if ((orefs & HAMMER2_DIO_MASK) == 1) {
                        /*
                         * Lastdrop case, INPROG already set.
                         */
                        if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
                                atomic_add_int(&hmp->iofree_count, 1);
                                return;
                        }
                } else {
                        /*
                         * Normal drop case.
                         */
                        if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
                                return;
                }
                cpu_pause();
                /* retry */
        }

        /*
         * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
         * have been cleared.  iofree_count has not yet been incremented;
         * note that a racing accessor will decrement iofree_count, so we
         * have to increment it regardless.
         *
         * We can now dispose of the buffer, and should do it before calling
         * io_complete() in case there's a race against a new reference
         * which causes io_complete() to chain and instantiate the bp again.
         */
        pbase = dio->pbase;
        psize = dio->psize;
        bp = dio->bp;
        dio->bp = NULL;

        if (orefs & HAMMER2_DIO_GOOD) {
                KKASSERT(bp != NULL);
#if 0
                if (hammer2_inval_enable &&
                    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
                        ++hammer2_iod_invals;
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        brelse(bp);
                } else
#endif
                if (orefs & HAMMER2_DIO_DIRTY) {
                        int hce;

                        dio_write_stats_update(dio, bp);
                        if ((hce = hammer2_cluster_write) > 0) {
                                /*
                                 * Allows write-behind to keep the buffer
                                 * cache sane.
                                 */
                                peof = (pbase + HAMMER2_SEGMASK64) &
                                       ~HAMMER2_SEGMASK64;
                                bp->b_flags |= B_CLUSTEROK;
                                cluster_write(bp, peof, psize, hce);
                        } else {
                                /*
                                 * Allows dirty buffers to accumulate and
                                 * possibly be canceled (e.g. by a 'rm'),
                                 * will burst-write later.
                                 */
                                bp->b_flags |= B_CLUSTEROK;
                                bdwrite(bp);
                        }
                } else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
                        brelse(bp);
                } else {
                        bqrelse(bp);
                }
        } else if (bp) {
#if 0
                if (hammer2_inval_enable &&
                    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
                        ++hammer2_iod_invals;
                        bp->b_flags |= B_INVAL | B_RELBUF;
                        brelse(bp);
                } else
#endif
                if (orefs & HAMMER2_DIO_DIRTY) {
                        dio_write_stats_update(dio, bp);
                        bdwrite(bp);
                } else {
                        bqrelse(bp);
                }
        }

        /*
         * The instant we call io_complete() the dio is a free agent again
         * and can be ripped out from under us.
         *
         * We can clean up our final DIO_INPROG by simulating an iocb
         * completion.
         */
        hmp = dio->hmp;                         /* extract fields */
        atomic_add_int(&hmp->iofree_count, 1);
        cpu_ccfence();

        iocb.dio = dio;
        iocb.flags = HAMMER2_IOCB_INPROG;
        hammer2_io_complete(&iocb);
        dio = NULL;                             /* dio stale */

        /*
         * We cache free buffers so re-use cases can use a shared lock, but
         * if too many build up we have to clean them out.
         */
        limit_dio = hammer2_limit_dio;
        if (limit_dio < 256)
                limit_dio = 256;
        if (limit_dio > 1024*1024)
                limit_dio = 1024*1024;
        if (hmp->iofree_count > limit_dio) {
                struct hammer2_cleanupcb_info info;

                RB_INIT(&info.tmptree);
                hammer2_spin_ex(&hmp->io_spin);
                if (hmp->iofree_count > limit_dio) {
                        info.count = hmp->iofree_count / 5;
                        RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
                                hammer2_io_cleanup_callback, &info);
                }
                hammer2_spin_unex(&hmp->io_spin);
                hammer2_io_cleanup(hmp, &info.tmptree);
        }
}
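
/*
 * Buffer disposal summary for the lastdrop path above: a GOOD+DIRTY
 * buffer is clustered out via cluster_write() or delay-written via
 * bdwrite(), a GOOD clean buffer is requeued with bqrelse() (brelse()
 * on error/invalidation), and a non-GOOD buffer is bdwrite()n if
 * dirty, else bqrelse()d.
 */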

/*
 * Clean up any dio's with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
        struct hammer2_cleanupcb_info *info = arg;
        hammer2_io_t *xio;

        if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
                if (dio->act > 0) {
                        int act;

                        act = dio->act - (ticks - dio->ticks) / hz - 1;
                        if (act > 0) {
                                dio->act = act;
                                return 0;
                        }
                        dio->act = 0;
                }
                KKASSERT(dio->bp == NULL);
                if (info->count > 0) {
                        RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
                        xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
                        KKASSERT(xio == NULL);
                        --info->count;
                }
        }
        return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
        hammer2_io_t *dio;

        while ((dio = RB_ROOT(tree)) != NULL) {
                RB_REMOVE(hammer2_io_tree, tree, dio);
                KKASSERT(dio->bp == NULL &&
                    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
                kfree(dio, M_HAMMER2);
                atomic_add_int(&hammer2_dio_count, -1);
                atomic_add_int(&hmp->iofree_count, -1);
        }
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
        struct buf *bp;
        int off;

        bp = dio->bp;
        KKASSERT(bp != NULL);
        off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
        KKASSERT(off >= 0 && off < bp->b_bufsize);
        return(bp->b_data + off);
}
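
/*
 * Example: with the 64KB DIO buffer loaded at b_loffset == dio->pbase,
 * a logical block at pbase+32KB resolves to bp->b_data + 32768.
 */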

#if 0
/*
 * Keep track of good CRCs in dio->good_crc_mask.  XXX needs to be done
 * in the chain structure, but the chain structure needs to be persistent
 * as well on refs=0 and it isn't.
 */
int
hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
{
        hammer2_io_t *dio;
        uint64_t mask;

        if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
                mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
                *maskp = mask;
                if ((dio->crc_good_mask & mask) == mask)
                        return 1;
                return 0;
        }
        *maskp = 0;

        return 0;
}

void
hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
{
        if (dio) {
                if (sizeof(long) == 8) {
                        atomic_set_long(&dio->crc_good_mask, mask);
                } else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
                        atomic_set_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)mask);
                        atomic_set_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)(mask >> 32));
#else
                        atomic_set_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)(mask >> 32));
                        atomic_set_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)mask);
#endif
                }
        }
}

void
hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
{
        if (dio) {
                if (sizeof(long) == 8) {
                        atomic_clear_long(&dio->crc_good_mask, mask);
                } else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)mask);
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)(mask >> 32));
#else
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
                                        (uint32_t)(mask >> 32));
                        atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
                                        (uint32_t)mask);
#endif
                }
        }
}
#endif

/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

        /*
         * If IOCB_INPROG is not set the dio already has a good buffer and we
         * can't mess with it other than to zero the requested range.
         *
         * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
         * do what needs to be done with dio->bp.
         */
        if (iocb->flags & HAMMER2_IOCB_INPROG) {
                if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
                        if (iocb->lsize == dio->psize) {
                                /*
                                 * Fully covered buffer, try to optimize to
                                 * avoid any I/O.  We might already have the
                                 * buffer due to iocb chaining.
                                 */
                                if (dio->bp == NULL) {
                                        dio->bp = getblk(dio->hmp->devvp,
                                                         dio->pbase, dio->psize,
                                                         gbctl, 0);
                                }
                                if (dio->bp) {
                                        vfs_bio_clrbuf(dio->bp);
                                        dio->bp->b_flags |= B_CACHE;
                                }
                        } else if (iocb->flags & HAMMER2_IOCB_QUICK) {
                                /*
                                 * Partial buffer, quick mode.  Do nothing.
                                 * Do not instantiate the buffer or try to
                                 * mark it B_CACHE because other portions of
                                 * the buffer might have to be read by other
                                 * accessors.
                                 */
                        } else if (dio->bp == NULL ||
                                   (dio->bp->b_flags & B_CACHE) == 0) {
                                /*
                                 * Partial buffer, normal mode, requires
                                 * read-before-write.  Chain the read.
                                 *
                                 * We might already have the buffer due to
                                 * iocb chaining.  XXX unclear if we really
                                 * need to write/release it and reacquire
                                 * in that case.
                                 *
                                 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
                                 */
                                if (dio->bp) {
                                        if (dio->refs & HAMMER2_DIO_DIRTY) {
                                                dio_write_stats_update(dio,
                                                                       dio->bp);
                                                bdwrite(dio->bp);
                                        } else {
                                                bqrelse(dio->bp);
                                        }
                                        dio->bp = NULL;
                                }
                                atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
                                breadcb(dio->hmp->devvp,
                                        dio->pbase, dio->psize,
                                        hammer2_io_callback, iocb);
                                return;
                        } /* else buffer is good */
                } /* else callback from breadcb is complete */
        }
        if (dio->bp) {
                if (iocb->flags & HAMMER2_IOCB_ZERO)
                        bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
                atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
        }
        hammer2_io_complete(iocb);
}

static
int
_hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                hammer2_io_t **diop, int flags)
{
        hammer2_iocb_t iocb;

        iocb.callback = hammer2_iocb_new_callback;
        iocb.chain = NULL;
        iocb.ptr = NULL;
        iocb.lbase = lbase;
        iocb.lsize = lsize;
        iocb.flags = flags;
        iocb.btype = btype;
        iocb.error = 0;
        hammer2_io_getblk(hmp, lbase, lsize, &iocb);
        if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
                hammer2_iocb_wait(&iocb);
        *diop = iocb.dio;

        return (iocb.error);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
               hammer2_io_t **diop)
{
        return(_hammer2_io_new(hmp, btype, lbase, lsize,
                               diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                 hammer2_io_t **diop)
{
        return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
}

/*
 * This is called from the freemap to pre-validate a full-sized buffer
 * whose contents we don't care about, in order to prevent an unnecessary
 * read-before-write.
 */
void
hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize)
{
        hammer2_io_t *dio = NULL;

        _hammer2_io_new(hmp, btype, lbase, lsize, &dio, HAMMER2_IOCB_QUICK);
        hammer2_io_bqrelse(&dio);
}

static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
        hammer2_io_t *dio = iocb->dio;
        off_t peof;
        int error;

        /*
         * If IOCB_INPROG is not set the dio already has a good buffer and we
         * can't mess with it other than to zero the requested range.
         *
         * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
         * do what needs to be done with dio->bp.
         */
        if (iocb->flags & HAMMER2_IOCB_INPROG) {
                int hce;

                if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
                        /*
                         * Already good, likely due to being chained from
                         * another iocb.
                         */
                        error = 0;
                } else if ((hce = hammer2_cluster_read) > 0) {
                        /*
                         * Synchronous cluster I/O for now.
                         */
                        if (dio->bp) {
                                bqrelse(dio->bp);
                                dio->bp = NULL;
                        }
                        peof = (dio->pbase + HAMMER2_SEGMASK64) &
                               ~HAMMER2_SEGMASK64;
                        error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
                                             dio->psize,
                                             dio->psize, HAMMER2_PBUFSIZE*hce,
                                             &dio->bp);
                } else {
                        /*
                         * Synchronous I/O for now.
                         */
                        if (dio->bp) {
                                bqrelse(dio->bp);
                                dio->bp = NULL;
                        }
                        error = bread(dio->hmp->devvp, dio->pbase,
                                      dio->psize, &dio->bp);
                }
                if (error) {
                        brelse(dio->bp);
                        dio->bp = NULL;
                }
        }
        hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
                hammer2_io_t **diop)
{
        hammer2_iocb_t iocb;

        iocb.callback = hammer2_iocb_bread_callback;
        iocb.chain = NULL;
        iocb.ptr = NULL;
        iocb.lbase = lbase;
        iocb.lsize = lsize;
        iocb.btype = btype;
        iocb.flags = 0;
        iocb.error = 0;
        hammer2_io_getblk(hmp, lbase, lsize, &iocb);
        if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
                hammer2_iocb_wait(&iocb);
        *diop = iocb.dio;

        return (iocb.error);
}

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
        struct buf *dbp = bio->bio_buf;
        hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
        hammer2_io_t *dio;

        dio = iocb->dio;
        if ((bio->bio_flags & BIO_DONE) == 0)
                bpdone(dbp, 0);
        bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
        dio->bp = bio->bio_buf;
        iocb->callback(iocb);
}

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
        atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
        atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
        atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
        hammer2_io_putblk(diop);
        return (0);     /* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
        atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
        /* NOP */
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
        hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
        hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
        return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
        hammer2_io_t *dio;
        int lsize;

        dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1);
        lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
        atomic_set_64(&dio->dedup_ok_mask,
                      hammer2_dedup_mask(dio, bref->data_off, lsize));
        hammer2_io_putblk(&dio);
}
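
/*
 * Example (assuming hammer2_dedup_mask(), defined elsewhere, uses the
 * same one-bit-per-1KB layout as hammer2_io_mask() above): registering
 * a 16KB data block sets 16 contiguous bits in dio->dedup_ok_mask, and
 * bulkfree later clears the same bits via hammer2_io_dedup_delete().
 */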

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
                        hammer2_off_t data_off, u_int bytes)
{
        hammer2_io_t *dio;

        if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
                return;
        if (btype != HAMMER2_BREF_TYPE_DATA)
                return;
        dio = hammer2_io_alloc(hmp, data_off, btype, 0);
        if (dio) {
                if (data_off < dio->pbase ||
                    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
                    dio->pbase + dio->psize) {
                        panic("hammer2_dedup_delete: DATAOFF BAD "
                              "%016jx/%d %016jx\n",
                              data_off, bytes, dio->pbase);
                }
                atomic_clear_64(&dio->dedup_ok_mask,
                                hammer2_dedup_mask(dio, data_off, bytes));
                hammer2_io_putblk(&dio);
        }
}

/*
 * Assert that dedup validation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
        hammer2_io_t *dio;

        dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA, 0);
        if (dio) {
                KASSERT((dio->dedup_ok_mask &
                          hammer2_dedup_mask(dio, data_off, bytes)) == 0,
                        ("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
                        data_off,
                        bytes,
                        hammer2_dedup_mask(dio, data_off, bytes),
                        dio->dedup_ok_mask));
                hammer2_io_putblk(&dio);
        }
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
        long *counterp;

        if (bp->b_flags & B_DELWRI)
                return;

        switch(dio->btype) {
        case 0:
                return;
        case HAMMER2_BREF_TYPE_DATA:
                counterp = &hammer2_iod_file_write;
                break;
        case HAMMER2_BREF_TYPE_DIRENT:
        case HAMMER2_BREF_TYPE_INODE:
                counterp = &hammer2_iod_meta_write;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                counterp = &hammer2_iod_indr_write;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                counterp = &hammer2_iod_fmap_write;
                break;
        default:
                counterp = &hammer2_iod_volu_write;
                break;
        }
        *counterp += dio->psize;
}