hammer2 - Revamp flush and xopq mechanism, stabilization
sys/vfs/hammer2/hammer2_io.c
/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used for OS abstraction, but the main
 * purpose is to allow larger buffers to be used against hammer2_chains
 * using smaller allocations, without causing deadlocks.
 */
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
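
/*
 * Worked example (illustrative only, not referenced by the code): each
 * mask bit covers one 1KB sub-chunk of the 64KB physical buffer.  For a
 * 16KB chunk beginning 32KB past dio->pbase:
 *
 *	i    = (32768 & HAMMER2_PBUFMASK) >> 10 = 32
 *	mask = (((uint64_t)1U << (16384 >> 10)) - 1) << 32
 *	     = 0x0000FFFF00000000
 */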

#define HAMMER2_GETBLK_GOOD	0
#define HAMMER2_GETBLK_QUEUED	1
#define HAMMER2_GETBLK_OWNED	2

/*
 * Allocate/Locate the requested dio, reference it, and issue or queue
 * the iocb.
 */
void
hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
		  hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	off_t pbase;
	off_t pmask;
	/*
	 * XXX after a free, the buffer-reuse case with a different size can
	 * clash with the dio cache.  Let's avoid it for now.  Ultimately we
	 * need to invalidate the dio cache when freeing blocks to allow a
	 * mix of 16KB and 64KB block sizes.
	 */
	/*int psize = hammer2_devblksize(lsize);*/
	int psize = HAMMER2_PBUFSIZE;
	uint64_t refs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		if ((atomic_fetchadd_64(&dio->refs, 1) &
		     HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		hammer2_spin_unsh(&hmp->io_spin);
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = iocb->btype;
		dio->refs = 1;
		hammer2_spin_init(&dio->spin, "h2dio");
		TAILQ_INIT(&dio->iocbq);
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			if ((atomic_fetchadd_64(&xio->refs, 1) &
			     HAMMER2_DIO_MASK) == 0) {
				atomic_add_int(&xio->hmp->iofree_count, -1);
			}
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	}

	/*
	 * Obtain/Validate the buffer.
	 */
	iocb->dio = dio;

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		/*
		 * Issue the iocb immediately if the buffer is already good.
		 * Once set, GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because dios are not interlocked for
		 * the DIO_GOOD test.
		 */
		if (refs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			iocb->callback(iocb);
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.
		 */
		if (refs & HAMMER2_DIO_INPROG) {
			/*
			 * If DIO_INPROG is already set then set WAITING and
			 * queue the iocb.
			 */
			hammer2_spin_ex(&dio->spin);
			if (atomic_cmpset_64(&dio->refs, refs,
					      refs | HAMMER2_DIO_WAITING)) {
				iocb->flags |= HAMMER2_IOCB_ONQ |
					       HAMMER2_IOCB_INPROG;
				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else {
			/*
			 * If DIO_INPROG is not set then set it and issue the
			 * callback immediately to start I/O.
			 */
			if (atomic_cmpset_64(&dio->refs, refs,
					      refs | HAMMER2_DIO_INPROG)) {
				iocb->flags |= HAMMER2_IOCB_INPROG;
				iocb->callback(iocb);
				break;
			}
			/* retry */
		}
	}
}
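
#if 0
/*
 * Usage sketch (hypothetical, for illustration only; it mirrors the
 * pattern used by _hammer2_io_new() and hammer2_io_bread() below).
 * "my_callback" stands in for a real iocb completion routine and is
 * not part of this file.
 */
static int
example_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_iocb_t iocb;

	bzero(&iocb, sizeof(iocb));
	iocb.callback = my_callback;	/* hypothetical callback */
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	return (iocb.error);
}
#endif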

/*
 * Quickly obtain a good DIO buffer; return NULL if the system no longer
 * caches the data.
 */
hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	off_t pmask;
	int psize = HAMMER2_PBUFSIZE;
	uint64_t orefs;
	uint64_t nrefs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio == NULL) {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}

	if ((atomic_fetchadd_64(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
		atomic_add_int(&dio->hmp->iofree_count, -1);
	hammer2_spin_unsh(&hmp->io_spin);

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	/*
	 * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
	 * the system does not have the data already cached.
	 */
	nrefs = (uint64_t)-1;
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Return the buffer immediately if it is already good.
		 * Once set, GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because the dio is not interlocked for
		 * the DIO_GOOD test.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.  INPROG might already be set, in which case
		 * there is no way we can do this non-blocking so we punt.
		 */
		if ((orefs & HAMMER2_DIO_INPROG))
			break;
		nrefs = orefs | HAMMER2_DIO_INPROG;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs) == 0)
			continue;

		/*
		 * We own DIO_INPROG, try to set DIO_GOOD.
		 *
		 * For now do not use GETBLK_NOWAIT because ...
		 */
		bp = dio->bp;
		dio->bp = NULL;
		if (bp == NULL) {
#if 0
			bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
#endif
			bread(hmp->devvp, dio->pbase, dio->psize, &bp);
		}

		/*
		 * System buffer must also have remained cached.
		 */
		if (bp) {
			if ((bp->b_flags & B_ERROR) == 0 &&
			    (bp->b_flags & B_CACHE)) {
				dio->bp = bp;	/* assign BEFORE setting flag */
				atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
			} else {
				bqrelse(bp);
				bp = NULL;
			}
		}

		/*
		 * Clear DIO_INPROG.
		 *
		 * This is actually a bit complicated, see
		 * hammer2_io_complete() for more information.
		 */
		iocb.dio = dio;
		iocb.flags = HAMMER2_IOCB_INPROG;
		hammer2_io_complete(&iocb);
		break;
	}

	/*
	 * Only return the dio if its buffer is good.  If the buffer is not
	 * good, be sure to clear INVALOK, meaning that invalidation is no
	 * longer acceptable.
	 */
	if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
		hammer2_io_putblk(&dio);
	}
	return dio;
}
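
/*
 * Illustrative note: a caller that only wants data the system may still
 * have cached (e.g. a dedup-style validation pass) can try
 * hammer2_io_getquick() first and fall back to the normal
 * hammer2_io_bread() path when it returns NULL.
 */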

/*
 * Make sure that INVALOK is cleared on the dio associated with the specified
 * data offset.  Called from bulkfree when a block becomes reusable.
 */
void
hammer2_io_resetinval(hammer2_dev_t *hmp, off_t data_off)
{
	hammer2_io_t *dio;

	data_off &= ~HAMMER2_PBUFMASK64;
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, data_off);
	if (dio)
		atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALOK);
	hammer2_spin_unsh(&hmp->io_spin);
}

/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	hammer2_iocb_t *cbtmp;
	uint64_t orefs;
	uint64_t nrefs;
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * If IOCB_INPROG was not set, completion is synchronous due to the
	 * buffer already being good.  We can simply set IOCB_DONE and return.
	 *
	 * In this situation DIO_INPROG is not set and we have no visibility
	 * on dio->bp.  We should not try to mess with dio->bp because another
	 * thread may be finishing up its processing.  dio->bp should already
	 * have been handed off via BUF_KERNPROC()!
	 */
	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
		return;
	}

	/*
	 * The iocb was queued, obtained DIO_INPROG, and its callback was
	 * made.  The callback is now complete.  We still own DIO_INPROG.
	 *
	 * We can set DIO_GOOD if no error occurred, which gives certain
	 * stability guarantees to dio->bp and allows other accessors to
	 * short-cut access.  DIO_GOOD cannot be cleared until the last
	 * ref is dropped.
	 */
	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
	if (dio->bp) {
		BUF_KERNPROC(dio->bp);
		if ((dio->bp->b_flags & B_ERROR) == 0) {
			KKASSERT(dio->bp->b_flags & B_CACHE);
			atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
		}
	}

	/*
	 * Clean up the dio before marking the iocb as being done.  If another
	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
	 * will call io completion and presumably clear DIO_INPROG).
	 *
	 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
	 * finishing up the cbio.  This means that DIO_INPROG is cleared at
	 * the end of the chain before ANY of the cbios are marked done.
	 *
	 * NOTE: The TAILQ is not stable until the spin-lock is held.
	 */
	for (;;) {
		orefs = dio->refs;
		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

		if (orefs & HAMMER2_DIO_WAITING) {
			hammer2_spin_ex(&dio->spin);
			cbtmp = TAILQ_FIRST(&dio->iocbq);
			if (cbtmp) {
				/*
				 * NOTE: flags not adjusted in this case.
				 *	 Flags will be adjusted by the last
				 *	 iocb.
				 */
				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
				hammer2_spin_unex(&dio->spin);
				cbtmp->callback(cbtmp);	/* chained */
				break;
			} else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			break;
		}
		/* else retry */
	}

	/*
	 * Mark the iocb as done and wakeup any waiters.  This is done after
	 * all iocb chains have been called back and after DIO_INPROG has been
	 * cleared.  This avoids races against ref count drops by the waiting
	 * threads (a hard but not impossible SMP race) which might result in
	 * a 1->0 transition of the refs while DIO_INPROG is still set.
	 */
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags;
		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
		nflags |= HAMMER2_IOCB_DONE;

		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			if (oflags & HAMMER2_IOCB_WAKEUP)
				wakeup(iocb);
			/* SMP: iocb is now stale */
			break;
		}
		/* retry */
	}
	iocb = NULL;
}

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
	uint32_t oflags;
	uint32_t nflags;

	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags | HAMMER2_IOCB_WAKEUP;
		if (oflags & HAMMER2_IOCB_DONE)
			break;
		tsleep_interlock(iocb, 0);
		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
		}
	}
}
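
/*
 * Note (explanatory): the loop above uses the DragonFly
 * tsleep_interlock()/tsleep(PINTERLOCKED) pattern so a wakeup() issued
 * between the IOCB_DONE test and the sleep is not lost, and the
 * atomic_cmpset of IOCB_WAKEUP ensures the completing thread knows a
 * waiter must be woken.
 */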

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	hammer2_iocb_t iocb;
	struct buf *bp;
	off_t peof;
	off_t pbase;
	int psize;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;

	while (dio->unused01) {
		tsleep(&dio->unused01, 0, "h2DEBUG", hz);
	}

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear flags and set INPROG.
	 *
	 * On the 1->0 transition if INPROG is already set, another thread
	 * is in lastdrop and we can just return after the transition.
	 *
	 * On any other transition we can generally just return.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs - 1;

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG can be set.
			 */
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs &= ~(HAMMER2_DIO_INVAL);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.
			 */
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				atomic_add_int(&hmp->iofree_count, 1);
				return;
			}
		} else {
			/*
			 * Normal drop case.
			 */
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if (orefs & HAMMER2_DIO_GOOD) {
		KKASSERT(bp != NULL);
#if 1
		if (hammer2_inval_enable &&
		    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
			++hammer2_iod_invals;
			bp->b_flags |= B_INVAL | B_RELBUF;
			brelse(bp);
		} else
#endif
		if (orefs & HAMMER2_DIO_DIRTY) {
			int hce;

			dio_write_stats_update(dio);
			if ((hce = hammer2_cluster_write) > 0) {
				/*
				 * Allows write-behind to keep the buffer
				 * cache sane.
				 */
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, peof, psize, hce);
			} else {
				/*
				 * Allows dirty buffers to accumulate and
				 * possibly be canceled (e.g. by a 'rm'),
				 * will burst-write later.
				 */
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
#if 1
		if (hammer2_inval_enable &&
		    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
			++hammer2_iod_invals;
			bp->b_flags |= B_INVAL | B_RELBUF;
			brelse(bp);
		} else
#endif
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio);
			bdwrite(bp);
		} else {
			brelse(bp);
		}
	}

	/*
	 * The instant we call io_complete(), the dio is a free agent again
	 * and can be ripped out from under us.
	 *
	 * We can clean up our final DIO_INPROG by simulating an iocb
	 * completion.
	 */
	hmp = dio->hmp;				/* extract fields */
	atomic_add_int(&hmp->iofree_count, 1);
	cpu_ccfence();

	iocb.dio = dio;
	iocb.flags = HAMMER2_IOCB_INPROG;
	hammer2_io_complete(&iocb);
	dio = NULL;				/* dio stale */

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	if (hmp->iofree_count > 65536) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > 65536) {
			info.count = hmp->iofree_count / 4;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}

/*
 * Clean up any dios with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			--dio->act;
			return 0;
		}
		KKASSERT(dio->bp == NULL);
		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
		KKASSERT(xio == NULL);
		if (--info->count <= 0)	/* limit scan */
			return(-1);
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
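
/*
 * Worked example (illustrative only): lbase carries the size radix in
 * its low bits, which the mask strips before the offset calculation.
 * For a logical block at device offset 0x18000 backed by a buffer whose
 * b_loffset is 0x10000, the returned pointer is bp->b_data + 0x8000.
 */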

#if 0
/*
 * Keep track of good CRCs in dio->crc_good_mask.  XXX needs to be done
 * in the chain structure, but the chain structure needs to be persistent
 * as well on refs=0 and it isn't.
 */
int
hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
{
	hammer2_io_t *dio;
	uint64_t mask;

	if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
		mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
		*maskp = mask;
		if ((dio->crc_good_mask & mask) == mask)
			return 1;
		return 0;
	}
	*maskp = 0;

	return 0;
}

void
hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
{
	if (dio) {
		if (sizeof(long) == 8) {
			atomic_set_long(&dio->crc_good_mask, mask);
		} else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)mask);
			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)(mask >> 32));
#else
			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)(mask >> 32));
			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)mask);
#endif
		}
	}
}

void
hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
{
	if (dio) {
		if (sizeof(long) == 8) {
			atomic_clear_long(&dio->crc_good_mask, mask);
		} else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)mask);
			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)(mask >> 32));
#else
			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)(mask >> 32));
			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)mask);
#endif
		}
	}
}
#endif

/*
 * Helpers for the hammer2_io_new*() functions.
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zeroing the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
			if (iocb->lsize == dio->psize) {
				/*
				 * Fully covered buffer, try to optimize to
				 * avoid any I/O.  We might already have the
				 * buffer due to iocb chaining.
				 */
				if (dio->bp == NULL) {
					dio->bp = getblk(dio->hmp->devvp,
							 dio->pbase, dio->psize,
							 gbctl, 0);
				}
				if (dio->bp) {
					vfs_bio_clrbuf(dio->bp);
					dio->bp->b_flags |= B_CACHE;
				}

				/*
				 * Invalidation is ok on newly allocated
				 * buffers which cover the entire buffer.
				 * Flag will be cleared on use by the de-dup
				 * code.
				 *
				 * hammer2_chain_modify() also checks this flag.
				 *
				 * QUICK mode is used by the freemap code to
				 * pre-validate a junk buffer to prevent an
				 * unnecessary read I/O.  We do NOT want
				 * to set INVALOK in that situation as the
				 * underlying allocations may be smaller.
				 */
				if ((iocb->flags & HAMMER2_IOCB_QUICK) == 0) {
					atomic_set_64(&dio->refs,
						      HAMMER2_DIO_INVALOK);
				}
			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
				/*
				 * Partial buffer, quick mode.  Do nothing.
				 * Do not instantiate the buffer or try to
				 * mark it B_CACHE because other portions of
				 * the buffer might have to be read by other
				 * accessors.
				 */
			} else if (dio->bp == NULL ||
				   (dio->bp->b_flags & B_CACHE) == 0) {
				/*
				 * Partial buffer, normal mode, requires
				 * read-before-write.  Chain the read.
				 *
				 * We might already have the buffer due to
				 * iocb chaining.  XXX unclear if we really
				 * need to write/release it and reacquire
				 * in that case.
				 *
				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
				 */
				if (dio->bp) {
					if (dio->refs & HAMMER2_DIO_DIRTY) {
						dio_write_stats_update(dio);
						bdwrite(dio->bp);
					} else {
						bqrelse(dio->bp);
					}
					dio->bp = NULL;
				}
				atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
				breadcb(dio->hmp->devvp,
					dio->pbase, dio->psize,
					hammer2_io_callback, iocb);
				return;
			} /* else buffer is good */
		} /* else callback from breadcb is complete */
	}
	if (dio->bp) {
		if (iocb->flags & HAMMER2_IOCB_ZERO)
			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
		atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
	}
	hammer2_io_complete(iocb);
}

static
int
_hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop, int flags)
{
	hammer2_iocb_t iocb;

	iocb.callback = hammer2_iocb_new_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = flags;
	iocb.btype = btype;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	*diop = iocb.dio;

	return (iocb.error);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, btype, lbase, lsize,
			       diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
}

/*
 * This is called from the freemap to pre-validate a full-sized buffer
 * whose contents we don't care about, in order to prevent an unnecessary
 * read-before-write.
 */
void
hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize)
{
	hammer2_io_t *dio = NULL;

	_hammer2_io_new(hmp, btype, lbase, lsize, &dio, HAMMER2_IOCB_QUICK);
	hammer2_io_bqrelse(&dio);
}

static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	off_t peof;
	int error;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than zeroing the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		int hce;

		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
			/*
			 * Already good, likely due to being chained from
			 * another iocb.
			 */
			error = 0;
		} else if ((hce = hammer2_cluster_read) > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
					     dio->psize,
					     dio->psize, HAMMER2_PBUFSIZE*hce,
					     &dio->bp);
		} else {
			/*
			 * Synchronous I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			error = bread(dio->hmp->devvp, dio->pbase,
				      dio->psize, &dio->bp);
		}
		if (error) {
			brelse(dio->bp);
			dio->bp = NULL;
		}
	}
	hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop)
{
	hammer2_iocb_t iocb;

	iocb.callback = hammer2_iocb_bread_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.btype = btype;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	*diop = iocb.dio;

	return (iocb.error);
}
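
#if 0
/*
 * Usage sketch (hypothetical, for illustration only): read a 16KB
 * logical block and release the dio without dirtying it.  Note that
 * lbase is a hammer2 data offset whose low 6 bits encode the size
 * radix (14 for 16KB), matching the KKASSERT in hammer2_io_getblk().
 */
static int
example_bread(hammer2_dev_t *hmp, off_t lbase)
{
	hammer2_io_t *dio;
	char *data;
	int error;

	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA, lbase,
				 16384, &dio);
	if (error == 0) {
		data = hammer2_io_data(dio, lbase);
		/* ... inspect data[0..16383] ... */
	}
	hammer2_io_bqrelse(&dio);
	return (error);
}
#endif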

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
	struct buf *dbp = bio->bio_buf;
	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
	hammer2_io_t *dio;

	dio = iocb->dio;
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(dbp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
	dio->bp = bio->bio_buf;
	iocb->callback(iocb);
}

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * Request an invalidation.  The hammer2_io code will oblige only if
 * DIO_INVALOK is also set.  INVALOK is cleared if the dio is used
 * in a dedup lookup and prevents invalidation of the dirty buffer.
 */
void
hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	if ((u_int)dio->psize == bytes)
		atomic_set_64(&dio->refs, HAMMER2_DIO_INVAL);
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}

static
void
dio_write_stats_update(hammer2_io_t *dio)
{
	long *counterp;

	switch(dio->btype) {
	case 0:
		return;
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}