hammer2 - stabilization - Fix bugs found by blogbench
[dragonfly.git] / sys / vfs / hammer2 / hammer2_strategy.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module handles low level logical file I/O (strategy) which backs
38  * the logical buffer cache.
39  *
40  * [De]compression, zero-block, check codes, and buffer cache operations
41  * for file data are handled here.
42  *
43  * Live dedup makes its home here as well.
44  */
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/buf.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/mount.h>
54 #include <sys/vnode.h>
55 #include <sys/mountctl.h>
56 #include <sys/dirent.h>
57 #include <sys/uio.h>
58 #include <sys/objcache.h>
59 #include <sys/event.h>
60 #include <sys/file.h>
61 #include <vfs/fifofs/fifo.h>
62
63 #include "hammer2.h"
64 #include "hammer2_lz4.h"
65
66 #include "zlib/hammer2_zlib.h"
67
68 struct objcache *cache_buffer_read;
69 struct objcache *cache_buffer_write;
70
71 /*
72  * Strategy code (async logical file buffer I/O from system)
73  *
74  * WARNING: The strategy code cannot safely use hammer2 transactions
75  *          as this can deadlock against vfs_sync's vfsync() call
76  *          if multiple flushes are queued.  All H2 structures must
77  *          already be present and ready for the DIO.
78  *
79  *          Reads can be initiated asynchronously, writes have to be
80  *          spooled to a separate thread for action to avoid deadlocks.
81  */
82 static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
83 static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
84 static int hammer2_strategy_read(struct vop_strategy_args *ap);
85 static int hammer2_strategy_write(struct vop_strategy_args *ap);
86 static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
87                                 char *data, struct bio *bio);
88
89 static void hammer2_dedup_record(hammer2_chain_t *chain, char *data);
90 static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp,
91                         char **datap, int pblksize);
92
93 int
94 hammer2_vop_strategy(struct vop_strategy_args *ap)
95 {
96         struct bio *biop;
97         struct buf *bp;
98         int error;
99
100         biop = ap->a_bio;
101         bp = biop->bio_buf;
102
103         switch(bp->b_cmd) {
104         case BUF_CMD_READ:
105                 error = hammer2_strategy_read(ap);
106                 ++hammer2_iod_file_read;
107                 break;
108         case BUF_CMD_WRITE:
109                 error = hammer2_strategy_write(ap);
110                 ++hammer2_iod_file_write;
111                 break;
112         default:
113                 bp->b_error = error = EINVAL;
114                 bp->b_flags |= B_ERROR;
115                 biodone(biop);
116                 break;
117         }
118         return (error);
119 }
120
121 /*
122  * Return the largest contiguous physical disk range for the logical
123  * request, in bytes.
124  *
125  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
126  *
127  * Basically disabled, the logical buffer write thread has to deal with
128  * buffers one-at-a-time.
129  */
130 int
131 hammer2_vop_bmap(struct vop_bmap_args *ap)
132 {
133         *ap->a_doffsetp = NOOFFSET;
134         if (ap->a_runp)
135                 *ap->a_runp = 0;
136         if (ap->a_runb)
137                 *ap->a_runb = 0;
138         return (EOPNOTSUPP);
139 }
140
141 /****************************************************************************
142  *                              READ SUPPORT                                *
143  ****************************************************************************/
144 /* 
145  * Callback used in the read path when a block is compressed with LZ4.
146  */
147 static
148 void
149 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
150 {
151         struct buf *bp;
152         char *compressed_buffer;
153         int compressed_size;
154         int result;
155
156         bp = bio->bio_buf;
157
158 #if 0
159         if bio->bio_caller_info2.index &&
160               bio->bio_caller_info1.uvalue32 !=
161               crc32(bp->b_data, bp->b_bufsize) --- return error
162 #endif
163
164         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
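            /*
             * The write path prefixes the LZ4 payload with an int holding
             * the compressed length (see hammer2_compress_and_write()).
             */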
165         compressed_size = *(const int *)data;
166         KKASSERT(compressed_size <= bytes - sizeof(int));
167
168         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
169         result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
170                                      compressed_buffer,
171                                      compressed_size,
172                                      bp->b_bufsize);
173         if (result < 0) {
174                 kprintf("READ PATH: Error during decompression, "
175                         "bio %016jx/%d\n",
176                         (intmax_t)bio->bio_offset, bytes);
177                 /* make sure it isn't random garbage */
178                 bzero(compressed_buffer, bp->b_bufsize);
179         }
180         KKASSERT(result <= bp->b_bufsize);
181         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
182         if (result < bp->b_bufsize)
183                 bzero(bp->b_data + result, bp->b_bufsize - result);
184         objcache_put(cache_buffer_read, compressed_buffer);
185         bp->b_resid = 0;
186         bp->b_flags |= B_AGE;
187 }
188
189 /*
190  * Callback used in the read path when a block is compressed with ZLIB.
191  * It is almost identical to the LZ4 callback, so in theory the two could
192  * be unified, but we did not want to change the bio structure for that.
193  */
194 static
195 void
196 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
197 {
198         struct buf *bp;
199         char *compressed_buffer;
200         z_stream strm_decompress;
201         int result;
202         int ret;
203
204         bp = bio->bio_buf;
205
206         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
207         strm_decompress.avail_in = 0;
208         strm_decompress.next_in = Z_NULL;
209
210         ret = inflateInit(&strm_decompress);
211
212         if (ret != Z_OK)
213                 kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");
214
215         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
216         strm_decompress.next_in = __DECONST(char *, data);
217
218         /* XXX supply proper size, subset of device bp */
219         strm_decompress.avail_in = bytes;
220         strm_decompress.next_out = compressed_buffer;
221         strm_decompress.avail_out = bp->b_bufsize;
222
223         ret = inflate(&strm_decompress, Z_FINISH);
224         if (ret != Z_STREAM_END) {
225                 kprintf("HAMMER2 ZLIB: Fatal error during decompression.\n");
226                 bzero(compressed_buffer, bp->b_bufsize);
227         }
228         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
229         result = bp->b_bufsize - strm_decompress.avail_out;
230         if (result < bp->b_bufsize)
231                 bzero(bp->b_data + result, strm_decompress.avail_out);
232         objcache_put(cache_buffer_read, compressed_buffer);
233         ret = inflateEnd(&strm_decompress);
234
235         bp->b_resid = 0;
236         bp->b_flags |= B_AGE;
237 }
238
239 /*
240  * Logical buffer I/O, async read.
241  */
242 static
243 int
244 hammer2_strategy_read(struct vop_strategy_args *ap)
245 {
246         hammer2_xop_strategy_t *xop;
247         struct buf *bp;
248         struct bio *bio;
249         struct bio *nbio;
250         hammer2_inode_t *ip;
251         hammer2_key_t lbase;
252
253         bio = ap->a_bio;
254         bp = bio->bio_buf;
255         ip = VTOI(ap->a_vp);
256         nbio = push_bio(bio);
257
258         lbase = bio->bio_offset;
259         KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
260
261         xop = hammer2_xop_alloc(ip, 0);
262         xop->finished = 0;
263         xop->bio = bio;
264         xop->lbase = lbase;
265         hammer2_mtx_init(&xop->lock, "h2bio");
266         hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);
267         /* asynchronous completion */
268
269         return(0);
270 }
271
272 /*
273  * Per-node XOP (threaded), do a synchronous lookup of the chain and
274  * its data.  The frontend is asynchronous, so we are also responsible
275  * for racing to terminate the frontend.
276  */
277 static
278 void
279 hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
280 {
281         hammer2_xop_strategy_t *xop = &arg->xop_strategy;
282         hammer2_chain_t *parent;
283         hammer2_chain_t *chain;
284         hammer2_key_t key_dummy;
285         hammer2_key_t lbase;
286         struct bio *bio;
287         struct buf *bp;
288         int cache_index = -1;
289         int error;
290
291         lbase = xop->lbase;
292         bio = xop->bio;
293         bp = bio->bio_buf;
294
295         parent = hammer2_inode_chain(xop->head.ip1, clindex,
296                                      HAMMER2_RESOLVE_ALWAYS |
297                                      HAMMER2_RESOLVE_SHARED);
298         if (parent) {
299                 chain = hammer2_chain_lookup(&parent, &key_dummy,
300                                              lbase, lbase,
301                                              &cache_index,
302                                              HAMMER2_LOOKUP_ALWAYS |
303                                              HAMMER2_LOOKUP_SHARED);
304                 error = chain ? chain->error : 0;
305         } else {
306                 error = EIO;
307                 chain = NULL;
308         }
309         error = hammer2_xop_feed(&xop->head, chain, clindex, error);
310         if (chain)
311                 hammer2_chain_drop(chain);
312         if (parent) {
313                 hammer2_chain_unlock(parent);
314                 hammer2_chain_drop(parent);
315         }
316         chain = NULL;   /* safety */
317         parent = NULL;  /* safety */
318
319         /*
320          * Race to finish the frontend
321          */
322         if (xop->finished)
323                 return;
324         hammer2_mtx_ex(&xop->lock);
325         if (xop->finished) {
326                 hammer2_mtx_unlock(&xop->lock);
327                 return;
328         }
329
330         /*
331          * Async operation has not completed and we now own the lock.
332          * Determine if we can complete the operation by issuing the
333          * frontend collection non-blocking.
334          */
335         error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
336
337         switch(error) {
338         case 0:
339                 xop->finished = 1;
340                 hammer2_mtx_unlock(&xop->lock);
341                 chain = xop->head.cluster.focus;
342                 hammer2_strategy_read_completion(chain, (char *)chain->data,
343                                                  xop->bio);
344                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
345                 biodone(bio);
346                 break;
347         case ENOENT:
348                 xop->finished = 1;
349                 hammer2_mtx_unlock(&xop->lock);
350                 bp->b_resid = 0;
351                 bp->b_error = 0;
352                 bzero(bp->b_data, bp->b_bcount);
353                 biodone(bio);
354                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
355                 break;
356         case EINPROGRESS:
357                 hammer2_mtx_unlock(&xop->lock);
358                 break;
359         default:
360                 xop->finished = 1;
361                 hammer2_mtx_unlock(&xop->lock);
362                 bp->b_flags |= B_ERROR;
363                 bp->b_error = EIO;
364                 biodone(bio);
365                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
366                 break;
367         }
368 }
369
370 static
371 void
372 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
373                                  struct bio *bio)
374 {
375         struct buf *bp = bio->bio_buf;
376
377         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
378                 /*
379                  * Data is embedded in the inode (copy from inode).
380                  */
381                 bcopy(((hammer2_inode_data_t *)data)->u.data,
382                       bp->b_data, HAMMER2_EMBEDDED_BYTES);
383                 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
384                       bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
385                 bp->b_resid = 0;
386                 bp->b_error = 0;
387         } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
388                 /*
389                  * Data is on-media, record for live dedup.
390                  */
391                 hammer2_dedup_record(chain, data);
392
393                 /*
394                  * Decompression and copy.
395                  */
396                 switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
397                 case HAMMER2_COMP_LZ4:
398                         hammer2_decompress_LZ4_callback(data, chain->bytes,
399                                                         bio);
400                         break;
401                 case HAMMER2_COMP_ZLIB:
402                         hammer2_decompress_ZLIB_callback(data, chain->bytes,
403                                                          bio);
404                         break;
405                 case HAMMER2_COMP_NONE:
406                         KKASSERT(chain->bytes <= bp->b_bcount);
407                         bcopy(data, bp->b_data, chain->bytes);
408                         if (chain->bytes < bp->b_bcount) {
409                                 bzero(bp->b_data + chain->bytes,
410                                       bp->b_bcount - chain->bytes);
411                         }
412                         bp->b_flags |= B_NOTMETA;
413                         bp->b_resid = 0;
414                         bp->b_error = 0;
415                         break;
416                 default:
417                         panic("hammer2_strategy_read: "
418                               "unknown compression type");
419                 }
420         } else {
421                 panic("hammer2_strategy_read: unknown bref type");
422         }
423 }
424
425 /****************************************************************************
426  *                              WRITE SUPPORT                               *
427  ****************************************************************************/
428
429 /* 
430  * Functions for compression in threads,
431  * from hammer2_vnops.c
432  */
433 static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
434                                 hammer2_chain_t **parentp,
435                                 hammer2_key_t lbase, int ioflag, int pblksize,
436                                 hammer2_tid_t mtid, int *errorp);
437 static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
438                                 hammer2_chain_t **parentp,
439                                 hammer2_key_t lbase, int ioflag, int pblksize,
440                                 hammer2_tid_t mtid, int *errorp,
441                                 int comp_algo, int check_algo);
442 static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
443                                 hammer2_chain_t **parentp,
444                                 hammer2_key_t lbase, int ioflag, int pblksize,
445                                 hammer2_tid_t mtid, int *errorp,
446                                 int check_algo);
447 static int test_block_zeros(const char *buf, size_t bytes);
448 static void zero_write(struct buf *bp, hammer2_inode_t *ip,
449                                 hammer2_chain_t **parentp,
450                                 hammer2_key_t lbase,
451                                 hammer2_tid_t mtid, int *errorp);
452 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
453                                 int ioflag, int pblksize,
454                                 hammer2_tid_t mtid, int *errorp,
455                                 int check_algo);
456
457 static
458 int
459 hammer2_strategy_write(struct vop_strategy_args *ap)
460 {       
461         hammer2_xop_strategy_t *xop;
462         hammer2_pfs_t *pmp;
463         struct bio *bio;
464         struct buf *bp;
465         hammer2_inode_t *ip;
466         
467         bio = ap->a_bio;
468         bp = bio->bio_buf;
469         ip = VTOI(ap->a_vp);
470         pmp = ip->pmp;
471         
472         hammer2_lwinprog_ref(pmp);
473         hammer2_trans_assert_strategy(pmp);
474
475         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
476         xop->finished = 0;
477         xop->bio = bio;
478         xop->lbase = bio->bio_offset;
479         hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
480         /* asynchronous completion */
481
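            /*
             * Throttle the frontend against the backend: each logical write
             * takes a lwinprog ref (above) which is dropped when the backend
             * finishes the buffer; wait here so no more than
             * hammer2_flush_pipe logical writes pile up.
             */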
482         hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);
483
484         return(0);
485 }
486
487 /*
488  * Per-node XOP (threaded).  Write the logical buffer to the media.
489  */
490 static
491 void
492 hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
493 {
494         hammer2_xop_strategy_t *xop = &arg->xop_strategy;
495         hammer2_chain_t *parent;
496         hammer2_key_t lbase;
497         hammer2_inode_t *ip;
498         struct bio *bio;
499         struct buf *bp;
500         int error;
501         int lblksize;
502         int pblksize;
503
504         lbase = xop->lbase;
505         bio = xop->bio;
506         bp = bio->bio_buf;
507         ip = xop->head.ip1;
508
509         /* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */
510
511         lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
512         pblksize = hammer2_calc_physical(ip, lbase);
513         parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
514         hammer2_write_file_core(bp, ip, &parent,
515                                 lbase, IO_ASYNC, pblksize,
516                                 xop->head.mtid, &error);
517         if (parent) {
518                 hammer2_chain_unlock(parent);
519                 hammer2_chain_drop(parent);
520                 parent = NULL;  /* safety */
521         }
522         error = hammer2_xop_feed(&xop->head, NULL, clindex, error);
523
524         /*
525          * Race to finish the frontend
526          */
527         if (xop->finished)
528                 return;
529         hammer2_mtx_ex(&xop->lock);
530         if (xop->finished) {
531                 hammer2_mtx_unlock(&xop->lock);
532                 return;
533         }
534
535         /*
536          * Async operation has not completed and we now own the lock.
537          * Determine if we can complete the operation by issuing the
538          * frontend collection non-blocking.
539          */
540         error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
541
542         switch(error) {
543         case ENOENT:
544         case 0:
545                 xop->finished = 1;
546                 hammer2_mtx_unlock(&xop->lock);
547                 bp->b_resid = 0;
548                 bp->b_error = 0;
549                 biodone(bio);
550                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
551                 hammer2_lwinprog_drop(ip->pmp);
552                 break;
553         case EINPROGRESS:
554                 hammer2_mtx_unlock(&xop->lock);
555                 break;
556         default:
557                 xop->finished = 1;
558                 hammer2_mtx_unlock(&xop->lock);
559                 bp->b_flags |= B_ERROR;
560                 bp->b_error = EIO;
561                 biodone(bio);
562                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
563                 hammer2_lwinprog_drop(ip->pmp);
564                 break;
565         }
566 }
567
568 /*
569  * Wait for pending I/O to complete
570  */
571 void
572 hammer2_bioq_sync(hammer2_pfs_t *pmp)
573 {
574         hammer2_lwinprog_wait(pmp, 0);
575 }
576
577 /* 
578  * Create a new chain at (*parentp, lbase) and assign physical storage,
579  * returning a chain suitable for I/O.  The chain will be in a modified
580  * state.
581  *
582  * *parentp can wind up being anything.
583  *
584  * If datap is not NULL, *datap points to the real data we intend to write.
585  * If we can dedup the storage location we set *datap to NULL to indicate
586  * to the caller that a dedup occurred.
587  *
588  * NOTE: Special case for data embedded in inode.
589  */
590 static
591 hammer2_chain_t *
592 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
593                         hammer2_key_t lbase, int pblksize,
594                         hammer2_tid_t mtid, char **datap, int *errorp)
595 {
596         hammer2_chain_t *chain;
597         hammer2_key_t key_dummy;
598         hammer2_off_t dedup_off;
599         int pradix = hammer2_getradix(pblksize);
600         int cache_index = -1;
601
602         /*
603          * Locate the chain associated with lbase, return a locked chain.
604          * However, do not instantiate any data reference (which utilizes a
605          * device buffer) because we will be using direct IO via the
606          * logical buffer cache buffer.
607          */
608         *errorp = 0;
609         KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
610 retry:
611         chain = hammer2_chain_lookup(parentp, &key_dummy,
612                                      lbase, lbase,
613                                      &cache_index,
614                                      HAMMER2_LOOKUP_NODATA);
615         if (chain == NULL) {
616                 /*
617                  * We found a hole, create a new chain entry.
618                  *
619                  * NOTE: DATA chains are created without device backing
620                  *       store (nor do we want any).
621                  */
622                 dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap,
623                                                  pblksize);
624                 *errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
625                                                lbase, HAMMER2_PBUFRADIX,
626                                                HAMMER2_BREF_TYPE_DATA,
627                                                pblksize, mtid,
628                                                dedup_off, 0);
629                 if (chain == NULL) {
630                         panic("hammer2_chain_create: par=%p error=%d\n",
631                               *parentp, *errorp);
632                         goto retry;
633                 }
634                 /*ip->delta_dcount += pblksize;*/
635         } else {
636                 switch (chain->bref.type) {
637                 case HAMMER2_BREF_TYPE_INODE:
638                         /*
639                          * The data is embedded in the inode, which requires
640                          * a bit more finesse.
641                          */
642                         hammer2_chain_modify_ip(ip, chain, mtid, 0);
643                         break;
644                 case HAMMER2_BREF_TYPE_DATA:
645                         dedup_off = hammer2_dedup_lookup(chain->hmp, datap,
646                                                          pblksize);
647                         if (chain->bytes != pblksize) {
648                                 hammer2_chain_resize(ip, *parentp, chain,
649                                                      mtid, dedup_off,
650                                                      pradix,
651                                                      HAMMER2_MODIFY_OPTDATA);
652                         }
653
654                         /*
655                          * DATA buffers must be marked modified whether the
656                          * data is in a logical buffer or not.  We also have
657                          * to make this call to fixup the chain data pointers
658                          * after resizing in case this is an encrypted or
659                          * compressed buffer.
660                          */
661                         hammer2_chain_modify(chain, mtid, dedup_off,
662                                              HAMMER2_MODIFY_OPTDATA);
663                         break;
664                 default:
665                         panic("hammer2_assign_physical: bad type");
666                         /* NOT REACHED */
667                         break;
668                 }
669         }
670         return (chain);
671 }
672
673 /* 
674  * hammer2_write_file_core() - hammer2_write_thread() helper
675  *
676  * The core write function which determines which path to take
677  * depending on compression settings.  We also have to locate the
678  * related chains so we can calculate and set the check data for
679  * the blockref.
680  */
681 static
682 void
683 hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
684                         hammer2_chain_t **parentp,
685                         hammer2_key_t lbase, int ioflag, int pblksize,
686                         hammer2_tid_t mtid, int *errorp)
687 {
688         hammer2_chain_t *chain;
689         char *data = bp->b_data;
690
691         switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
692         case HAMMER2_COMP_NONE:
693                 /*
694                  * We have to assign physical storage to the buffer
695                  * we intend to dirty or write now to avoid deadlocks
696                  * in the strategy code later.
697                  *
698                  * This can return NOOFFSET for inode-embedded data.
699                  * The strategy code will take care of it in that case.
700                  */
701                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
702                                                 mtid, &data, errorp);
703                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
704                         hammer2_inode_data_t *wipdata;
705
706                         wipdata = &chain->data->ipdata;
707                         KKASSERT(wipdata->meta.op_flags &
708                                  HAMMER2_OPFLAG_DIRECTDATA);
709                         KKASSERT(bp->b_loffset == 0);
710                         bcopy(bp->b_data, wipdata->u.data,
711                               HAMMER2_EMBEDDED_BYTES);
712                         ++hammer2_iod_file_wembed;
713                 } else if (data == NULL) {
714                         /*
715                          * Copy of data already present on-media.
716                          */
717                         chain->bref.methods =
718                                 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
719                                 HAMMER2_ENC_CHECK(ip->meta.check_algo);
720                         hammer2_chain_setcheck(chain, bp->b_data);
721                 } else {
722                         hammer2_write_bp(chain, bp, ioflag, pblksize,
723                                          mtid, errorp, ip->meta.check_algo);
724                 }
725                 if (chain) {
726                         hammer2_chain_unlock(chain);
727                         hammer2_chain_drop(chain);
728                 }
729                 break;
730         case HAMMER2_COMP_AUTOZERO:
731                 /*
732                  * Check for zero-fill only
733                  */
734                 hammer2_zero_check_and_write(bp, ip, parentp,
735                                              lbase, ioflag, pblksize,
736                                              mtid, errorp,
737                                              ip->meta.check_algo);
738                 break;
739         case HAMMER2_COMP_LZ4:
740         case HAMMER2_COMP_ZLIB:
741         default:
742                 /*
743                  * Check for zero-fill and attempt compression.
744                  */
745                 hammer2_compress_and_write(bp, ip, parentp,
746                                            lbase, ioflag, pblksize,
747                                            mtid, errorp,
748                                            ip->meta.comp_algo,
749                                            ip->meta.check_algo);
750                 break;
751         }
752 }
753
754 /*
755  * Helper
756  *
757  * Generic function that performs the compression in the compressed
758  * write path.  The compression algorithm is determined by settings
759  * obtained from the inode.
760  */
761 static
762 void
763 hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
764         hammer2_chain_t **parentp,
765         hammer2_key_t lbase, int ioflag, int pblksize,
766         hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo)
767 {
768         hammer2_chain_t *chain;
769         int comp_size;
770         int comp_block_size;
771         char *comp_buffer;
772         char *data;
773
774         if (test_block_zeros(bp->b_data, pblksize)) {
775                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
776                 return;
777         }
778
779         comp_size = 0;
780         comp_buffer = NULL;
781
782         KKASSERT(pblksize / 2 <= 32768);
783                 
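            /*
             * comp_heuristic is zeroed on a successful compression and
             * bumped on each failure (capped and reset to 8 when it
             * exceeds 128), so after repeated failures compression is
             * only re-attempted on every 8th write.
             */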
784         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
785                 z_stream strm_compress;
786                 int comp_level;
787                 int ret;
788
789                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
790                 case HAMMER2_COMP_LZ4:
791                         comp_buffer = objcache_get(cache_buffer_write,
792                                                    M_INTWAIT);
793                         comp_size = LZ4_compress_limitedOutput(
794                                         bp->b_data,
795                                         &comp_buffer[sizeof(int)],
796                                         pblksize,
797                                         pblksize / 2 - sizeof(int));
798                         /*
799                          * We need to prefix with the size, LZ4
800                          * doesn't do it for us.  Add the related
801                          * overhead.
802                          */
803                         *(int *)comp_buffer = comp_size;
804                         if (comp_size)
805                                 comp_size += sizeof(int);
806                         break;
807                 case HAMMER2_COMP_ZLIB:
808                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
809                         if (comp_level == 0)
810                                 comp_level = 6; /* default zlib compression */
811                         else if (comp_level < 6)
812                                 comp_level = 6;
813                         else if (comp_level > 9)
814                                 comp_level = 9;
815                         ret = deflateInit(&strm_compress, comp_level);
816                         if (ret != Z_OK) {
817                                 kprintf("HAMMER2 ZLIB: fatal error "
818                                         "on deflateInit.\n");
819                         }
820
821                         comp_buffer = objcache_get(cache_buffer_write,
822                                                    M_INTWAIT);
823                         strm_compress.next_in = bp->b_data;
824                         strm_compress.avail_in = pblksize;
825                         strm_compress.next_out = comp_buffer;
826                         strm_compress.avail_out = pblksize / 2;
827                         ret = deflate(&strm_compress, Z_FINISH);
828                         if (ret == Z_STREAM_END) {
829                                 comp_size = pblksize / 2 -
830                                             strm_compress.avail_out;
831                         } else {
832                                 comp_size = 0;
833                         }
834                         ret = deflateEnd(&strm_compress);
835                         break;
836                 default:
837                         kprintf("Error: Unknown compression method.\n");
838                         kprintf("Comp_method = %d.\n", comp_algo);
839                         break;
840                 }
841         }
842
843         if (comp_size == 0) {
844                 /*
845                  * compression failed or turned off
846                  */
847                 comp_block_size = pblksize;     /* safety */
848                 if (++ip->comp_heuristic > 128)
849                         ip->comp_heuristic = 8;
850         } else {
851                 /*
852                  * compression succeeded
853                  */
854                 ip->comp_heuristic = 0;
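                    /*
                     * Round the compressed size up to the smallest
                     * supported physical block size (power of 2,
                     * 1KB-32KB).
                     */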
855                 if (comp_size <= 1024) {
856                         comp_block_size = 1024;
857                 } else if (comp_size <= 2048) {
858                         comp_block_size = 2048;
859                 } else if (comp_size <= 4096) {
860                         comp_block_size = 4096;
861                 } else if (comp_size <= 8192) {
862                         comp_block_size = 8192;
863                 } else if (comp_size <= 16384) {
864                         comp_block_size = 16384;
865                 } else if (comp_size <= 32768) {
866                         comp_block_size = 32768;
867                 } else {
868                         panic("hammer2: WRITE PATH: "
869                               "Weird comp_size value.");
870                         /* NOT REACHED */
871                         comp_block_size = pblksize;
872                 }
873
874                 /*
875                  * Must zero the remainder or dedup (which operates on a
876                  * physical block basis) will not find matches.
877                  */
878                 if (comp_size < comp_block_size) {
879                         bzero(comp_buffer + comp_size,
880                               comp_block_size - comp_size);
881                 }
882         }
883
884         /*
885          * Assign physical storage, data will be set to NULL if a live-dedup
886          * was successful.
887          */
888         data = comp_size ? comp_buffer : bp->b_data;
889         chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size,
890                                         mtid, &data, errorp);
891
892         if (*errorp) {
893                 kprintf("WRITE PATH: An error occurred while "
894                         "assigning physical space.\n");
895                 KKASSERT(chain == NULL);
896                 goto done;
897         }
898
899         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
900                 hammer2_inode_data_t *wipdata;
901
902                 hammer2_chain_modify_ip(ip, chain, mtid, 0);
903                 wipdata = &chain->data->ipdata;
904                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
905                 KKASSERT(bp->b_loffset == 0);
906                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
907                 ++hammer2_iod_file_wembed;
908         } else if (data == NULL) {
909                 /*
910                  * Live deduplication, a copy of the data is already present
911                  * on the media.
912                  */
913                 char *bdata;
914
915                 if (comp_size) {
916                         chain->bref.methods =
917                                 HAMMER2_ENC_COMP(comp_algo) +
918                                 HAMMER2_ENC_CHECK(check_algo);
919                 } else {
920                         chain->bref.methods =
921                                 HAMMER2_ENC_COMP(
922                                         HAMMER2_COMP_NONE) +
923                                 HAMMER2_ENC_CHECK(check_algo);
924                 }
925                 bdata = comp_size ? comp_buffer : bp->b_data;
926                 hammer2_chain_setcheck(chain, bdata);
927                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
928         } else {
929                 hammer2_io_t *dio;
930                 char *bdata;
931
932                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
933
934                 switch(chain->bref.type) {
935                 case HAMMER2_BREF_TYPE_INODE:
936                         panic("hammer2_compress_and_write: unexpected inode\n");
937                         break;
938                 case HAMMER2_BREF_TYPE_DATA:
939                         /*
940                          * Optimize out the read-before-write
941                          * if possible.
942                          */
943                         *errorp = hammer2_io_newnz(chain->hmp,
944                                                    chain->bref.data_off,
945                                                    chain->bytes,
946                                                    &dio);
947                         if (*errorp) {
948                                 hammer2_io_brelse(&dio);
949                                 kprintf("hammer2: WRITE PATH: "
950                                         "dbp bread error\n");
951                                 break;
952                         }
953                         bdata = hammer2_io_data(dio, chain->bref.data_off);
954
955                         /*
956                          * When loading the block make sure we don't
957                          * leave garbage after the compressed data.
958                          */
959                         if (comp_size) {
960                                 chain->bref.methods =
961                                         HAMMER2_ENC_COMP(comp_algo) +
962                                         HAMMER2_ENC_CHECK(check_algo);
963                                 bcopy(comp_buffer, bdata, comp_size);
964                         } else {
965                                 chain->bref.methods =
966                                         HAMMER2_ENC_COMP(
967                                                 HAMMER2_COMP_NONE) +
968                                         HAMMER2_ENC_CHECK(check_algo);
969                                 bcopy(bp->b_data, bdata, pblksize);
970                         }
971
972                         /*
973                          * The flush code doesn't calculate check codes for
974                          * file data (doing so can result in excessive I/O),
975                          * so we do it here.
976                          */
977                         hammer2_chain_setcheck(chain, bdata);
978                         hammer2_dedup_record(chain, bdata);
979
980                         /*
981                          * Device buffer is now valid, chain is no longer in
982                          * the initial state.
983                          *
984                          * (No blockref table worries with file data)
985                          */
986                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
987
988                         /* Now write the related bdp. */
989                         if (ioflag & IO_SYNC) {
990                                 /*
991                                  * Synchronous I/O requested.
992                                  */
993                                 hammer2_io_bwrite(&dio);
994                         /*
995                         } else if ((ioflag & IO_DIRECT) &&
996                                    loff + n == pblksize) {
997                                 hammer2_io_bdwrite(&dio);
998                         */
999                         } else if (ioflag & IO_ASYNC) {
1000                                 hammer2_io_bawrite(&dio);
1001                         } else {
1002                                 hammer2_io_bdwrite(&dio);
1003                         }
1004                         break;
1005                 default:
1006                         panic("hammer2_compress_and_write: "
1007                               "bad chain type %d\n", chain->bref.type);
1008                         /* NOT REACHED */
1009                         break;
1010                 }
1011         }
1012 done:
1013         if (chain) {
1014                 hammer2_chain_unlock(chain);
1015                 hammer2_chain_drop(chain);
1016         }
1017         if (comp_buffer)
1018                 objcache_put(cache_buffer_write, comp_buffer);
1019 }
1020
1021 /*
1022  * Helper
1023  *
1024  * Function that performs zero-checking and writing without compression;
1025  * it corresponds to the default zero-checking path.
1026  */
1027 static
1028 void
1029 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
1030         hammer2_chain_t **parentp,
1031         hammer2_key_t lbase, int ioflag, int pblksize,
1032         hammer2_tid_t mtid, int *errorp,
1033         int check_algo)
1034 {
1035         hammer2_chain_t *chain;
1036         char *data = bp->b_data;
1037
1038         if (test_block_zeros(bp->b_data, pblksize)) {
1039                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
1040         } else {
1041                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
1042                                                 mtid, &data, errorp);
1043                 if (data) {
1044                         hammer2_write_bp(chain, bp, ioflag, pblksize,
1045                                          mtid, errorp, check_algo);
1046                 } /* else dedup occurred */
1047                 if (chain) {
1048                         hammer2_chain_unlock(chain);
1049                         hammer2_chain_drop(chain);
1050                 }
1051         }
1052 }
1053
1054 /*
1055  * Helper
1056  *
1057  * Test whether a block of data contains only zeros.  Returns non-zero
1058  * (TRUE) if the block is all zeros.
1059  */
1060 static
1061 int
1062 test_block_zeros(const char *buf, size_t bytes)
1063 {
1064         size_t i;
1065
1066         for (i = 0; i < bytes; i += sizeof(long)) {
1067                 if (*(const long *)(buf + i) != 0)
1068                         return (0);
1069         }
1070         return (1);
1071 }
1072
1073 /*
1074  * Helper
1075  *
1076  * Function to "write" a block that contains only zeros.
1077  */
1078 static
1079 void
1080 zero_write(struct buf *bp, hammer2_inode_t *ip,
1081            hammer2_chain_t **parentp,
1082            hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused)
1083 {
1084         hammer2_chain_t *chain;
1085         hammer2_key_t key_dummy;
1086         int cache_index = -1;
1087
1088         chain = hammer2_chain_lookup(parentp, &key_dummy,
1089                                      lbase, lbase,
1090                                      &cache_index,
1091                                      HAMMER2_LOOKUP_NODATA);
1092         if (chain) {
1093                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1094                         hammer2_inode_data_t *wipdata;
1095
1096                         hammer2_chain_modify_ip(ip, chain, mtid, 0);
1097                         wipdata = &chain->data->ipdata;
1098                         KKASSERT(wipdata->meta.op_flags &
1099                                  HAMMER2_OPFLAG_DIRECTDATA);
1100                         KKASSERT(bp->b_loffset == 0);
1101                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1102                         ++hammer2_iod_file_wembed;
1103                 } else {
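                             /*
                              * A missing block reads back as zeros, so an
                              * all-zeros logical block is represented by
                              * deleting any existing data chain outright.
                              */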
1104                         hammer2_chain_delete(*parentp, chain,
1105                                              mtid, HAMMER2_DELETE_PERMANENT);
1106                         ++hammer2_iod_file_wzero;
1107                 }
1108                 hammer2_chain_unlock(chain);
1109                 hammer2_chain_drop(chain);
1110         } else {
1111                 ++hammer2_iod_file_wzero;
1112         }
1113 }
1114
1115 /*
1116  * Helper
1117  *
1118  * Function to write the data as it is, without performing any sort of
1119  * compression.  This function is used by the no-compression path and
1120  * by the default zero-checking path.
1121  */
1122 static
1123 void
1124 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
1125                  int pblksize,
1126                  hammer2_tid_t mtid, int *errorp, int check_algo)
1127 {
1128         hammer2_inode_data_t *wipdata;
1129         hammer2_io_t *dio;
1130         char *bdata;
1131         int error;
1132
1133         error = 0;      /* XXX TODO below */
1134
1135         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1136
1137         switch(chain->bref.type) {
1138         case HAMMER2_BREF_TYPE_INODE:
1139                 wipdata = &chain->data->ipdata;
1140                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1141                 KKASSERT(bp->b_loffset == 0);
1142                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1143                 error = 0;
1144                 ++hammer2_iod_file_wembed;
1145                 break;
1146         case HAMMER2_BREF_TYPE_DATA:
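                     /*
                      * Acquire the device buffer without a read-before-write;
                      * the entire physical block is overwritten below.
                      */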
1147                 error = hammer2_io_newnz(chain->hmp,
1148                                          chain->bref.data_off,
1149                                          chain->bytes, &dio);
1150                 if (error) {
1151                         hammer2_io_bqrelse(&dio);
1152                         kprintf("hammer2: WRITE PATH: "
1153                                 "dbp bread error\n");
1154                         break;
1155                 }
1156                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1157
1158                 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
1159                                       HAMMER2_ENC_CHECK(check_algo);
1160                 bcopy(bp->b_data, bdata, chain->bytes);
1161
1162                 /*
1163                  * The flush code doesn't calculate check codes for
1164                  * file data (doing so can result in excessive I/O),
1165                  * so we do it here.
1166                  */
1167                 hammer2_chain_setcheck(chain, bdata);
1168                 hammer2_dedup_record(chain, bdata);
1169
1170                 /*
1171                  * Device buffer is now valid, chain is no longer in
1172                  * the initial state.
1173                  *
1174                  * (No blockref table worries with file data)
1175                  */
1176                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1177
1178                 if (ioflag & IO_SYNC) {
1179                         /*
1180                          * Synchronous I/O requested.
1181                          */
1182                         hammer2_io_bwrite(&dio);
1183                 /*
1184                 } else if ((ioflag & IO_DIRECT) &&
1185                            loff + n == pblksize) {
1186                         hammer2_io_bdwrite(&dio);
1187                 */
1188                 } else if (ioflag & IO_ASYNC) {
1189                         hammer2_io_bawrite(&dio);
1190                 } else {
1191                         hammer2_io_bdwrite(&dio);
1192                 }
1193                 break;
1194         default:
1195                 panic("hammer2_write_bp: bad chain type %d\n",
1196                       chain->bref.type);
1197                 /* NOT REACHED */
1198                 error = 0;
1199                 break;
1200         }
1201         KKASSERT(error == 0);   /* XXX TODO */
1202         *errorp = error;
1203 }
1204
1205 /*
1206  * LIVE DEDUP HEURISTIC
1207  *
1208  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
1209  *          All fields must be loaded into locals and validated.
1210  */
1211 static
1212 void
1213 hammer2_dedup_record(hammer2_chain_t *chain, char *data)
1214 {
1215         hammer2_dev_t *hmp;
1216         hammer2_dedup_t *dedup;
1217         int32_t crc;
1218         int best = 0;
1219         int i;
1220         int dticks;
1221
1222         hmp = chain->hmp;
1223         crc = hammer2_icrc32(data, chain->bytes);
1224         dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];
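             /*
              * The low bits of the data crc select a set of four entries.
              * Prefer a slot whose crc already matches, otherwise fall back
              * to the oldest (or long-stale) slot in the set.
              */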
1225         for (i = 0; i < 4; ++i) {
1226                 if (dedup[i].data_crc == crc) {
1227                         best = i;
1228                         break;
1229                 }
1230                 dticks = (int)(dedup[i].ticks - dedup[best].ticks);
1231                 if (dticks < 0 || dticks > hz * 60 * 30)
1232                         best = i;
1233         }
1234         dedup += best;
1235         if (hammer2_debug & 0x40000) {
1236                 kprintf("REC %04x %08x %016jx\n",
1237                         (int)(dedup - hmp->heur_dedup),
1238                         crc,
1239                         chain->bref.data_off);
1240         }
1241         dedup->ticks = ticks;
1242         dedup->data_off = chain->bref.data_off;
1243         dedup->data_crc = crc;
1244         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
1245 }
1246
1247 static
1248 hammer2_off_t
1249 hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
1250 {
1251         hammer2_dedup_t *dedup;
1252         hammer2_io_t *dio;
1253         hammer2_off_t off;
1254         uint32_t crc;
1255         char *data;
1256         int i;
1257
1258         data = *datap;
1259         if (data == NULL)
1260                 return 0;
1261
1262         crc = hammer2_icrc32(data, pblksize);
1263         dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];
1264
1265         if (hammer2_debug & 0x40000) {
1266                 kprintf("LOC %04x/4 %08x\n",
1267                         (int)(dedup - hmp->heur_dedup),
1268                         crc);
1269         }
1270
1271         for (i = 0; i < 4; ++i) {
1272                 off = dedup[i].data_off;
1273                 cpu_ccfence();
1274                 if (dedup[i].data_crc != crc)
1275                         continue;
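                     /* the low bits of data_off encode the block radix (size) */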
1276                 if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize)
1277                         continue;
1278                 dio = hammer2_io_getquick(hmp, off, pblksize);
1279                 if (dio &&
1280                     bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) {
1281                         if (hammer2_debug & 0x40000) {
1282                                 kprintf("DEDUP SUCCESS %016jx\n",
1283                                         (intmax_t)off);
1284                         }
1285                         hammer2_io_putblk(&dio);
1286                         *datap = NULL;
1287                         dedup[i].ticks = ticks; /* update use */
1288                         ++hammer2_iod_file_wdedup;
1289                         return off;             /* RETURN */
1290                 }
1291                 if (dio)
1292                         hammer2_io_putblk(&dio);
1293         }
1294         return 0;
1295 }
1296
1297 /*
1298  * Poof.  Races are ok, if someone gets in and reuses a dedup offset
1299  * before or while we are clearing it they will also recover the freemap
1300  * entry (set it to fully allocated), so a bulkfree race can only set it
1301  * to a possibly-free state.
1302  *
1303  * XXX ok, well, not really sure races are ok but going to run with it
1304  *     for the moment.
1305  */
1306 void
1307 hammer2_dedup_clear(hammer2_dev_t *hmp)
1308 {
1309         int i;
1310
1311         for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) {
1312                 hmp->heur_dedup[i].data_off = 0;
1313                 hmp->heur_dedup[i].ticks = ticks - 1;
1314         }
1315 }