hammer2 - bug fixes
[dragonfly.git] / sys / vfs / hammer2 / hammer2_strategy.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module handles low level logical file I/O (strategy) which backs
38  * the logical buffer cache.
39  *
40  * [De]compression, zero-block, check codes, and buffer cache operations
41  * for file data is handled here.
42  *
43  * Live dedup makes its home here as well.
44  */
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/buf.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/mount.h>
54 #include <sys/vnode.h>
55 #include <sys/mountctl.h>
56 #include <sys/dirent.h>
57 #include <sys/uio.h>
58 #include <sys/objcache.h>
59 #include <sys/event.h>
60 #include <sys/file.h>
61 #include <vfs/fifofs/fifo.h>
62
63 #include "hammer2.h"
64 #include "hammer2_lz4.h"
65
66 #include "zlib/hammer2_zlib.h"
67
/*
 * Object caches supplying temporary buffers for (de)compression.
 *
 * In this file cache_buffer_read provides the scratch buffer used by the
 * LZ4 and ZLIB decompression callbacks, and cache_buffer_write provides
 * the output buffer used by hammer2_compress_and_write().
 *
 * NOTE(review): both are non-static and are not created in the part of
 * the file shown here; presumably they are set up elsewhere in the
 * module -- confirm.
 */
struct objcache *cache_buffer_read;
struct objcache *cache_buffer_write;
70
71 /*
72  * Strategy code (async logical file buffer I/O from system)
73  *
74  * WARNING: The strategy code cannot safely use hammer2 transactions
75  *          as this can deadlock against vfs_sync's vfsync() call
76  *          if multiple flushes are queued.  All H2 structures must
77  *          already be present and ready for the DIO.
78  *
79  *          Reads can be initiated asynchronously, writes have to be
80  *          spooled to a separate thread for action to avoid deadlocks.
81  */
/* Backend XOP functions, handed to hammer2_xop_start() by the frontends. */
static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
/* Frontends, selected by hammer2_vop_strategy() from the buffer command. */
static int hammer2_strategy_read(struct vop_strategy_args *ap);
static int hammer2_strategy_write(struct vop_strategy_args *ap);
/* Fills the logical buffer of a read bio from the chain's data. */
static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
                                char *data, struct bio *bio);

/* Live dedup helpers (static to this file). */
static void hammer2_dedup_record(hammer2_chain_t *chain, char *data);
static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp,
                        char **datap, int pblksize);
92
93 int
94 hammer2_vop_strategy(struct vop_strategy_args *ap)
95 {
96         struct bio *biop;
97         struct buf *bp;
98         int error;
99
100         biop = ap->a_bio;
101         bp = biop->bio_buf;
102
103         switch(bp->b_cmd) {
104         case BUF_CMD_READ:
105                 error = hammer2_strategy_read(ap);
106                 ++hammer2_iod_file_read;
107                 break;
108         case BUF_CMD_WRITE:
109                 error = hammer2_strategy_write(ap);
110                 ++hammer2_iod_file_write;
111                 break;
112         default:
113                 bp->b_error = error = EINVAL;
114                 bp->b_flags |= B_ERROR;
115                 biodone(biop);
116                 break;
117         }
118         return (error);
119 }
120
121 /*
122  * Return the largest contiguous physical disk range for the logical
123  * request, in bytes.
124  *
125  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
126  *
127  * Basically disabled, the logical buffer write thread has to deal with
128  * buffers one-at-a-time.
129  */
130 int
131 hammer2_vop_bmap(struct vop_bmap_args *ap)
132 {
133         *ap->a_doffsetp = NOOFFSET;
134         if (ap->a_runp)
135                 *ap->a_runp = 0;
136         if (ap->a_runb)
137                 *ap->a_runb = 0;
138         return (EOPNOTSUPP);
139 }
140
141 /****************************************************************************
142  *                              READ SUPPORT                                *
143  ****************************************************************************/
144 /* 
145  * Callback used in read path in case that a block is compressed with LZ4.
146  */
147 static
148 void
149 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
150 {
151         struct buf *bp;
152         char *compressed_buffer;
153         int compressed_size;
154         int result;
155
156         bp = bio->bio_buf;
157
158 #if 0
159         if bio->bio_caller_info2.index &&
160               bio->bio_caller_info1.uvalue32 !=
161               crc32(bp->b_data, bp->b_bufsize) --- return error
162 #endif
163
164         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
165         compressed_size = *(const int *)data;
166         KKASSERT(compressed_size <= bytes - sizeof(int));
167
168         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
169         result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
170                                      compressed_buffer,
171                                      compressed_size,
172                                      bp->b_bufsize);
173         if (result < 0) {
174                 kprintf("READ PATH: Error during decompression."
175                         "bio %016jx/%d\n",
176                         (intmax_t)bio->bio_offset, bytes);
177                 /* make sure it isn't random garbage */
178                 bzero(compressed_buffer, bp->b_bufsize);
179         }
180         KKASSERT(result <= bp->b_bufsize);
181         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
182         if (result < bp->b_bufsize)
183                 bzero(bp->b_data + result, bp->b_bufsize - result);
184         objcache_put(cache_buffer_read, compressed_buffer);
185         bp->b_resid = 0;
186         bp->b_flags |= B_AGE;
187 }
188
189 /*
190  * Callback used in read path in case that a block is compressed with ZLIB.
191  * It is almost identical to LZ4 callback, so in theory they can be unified,
192  * but we didn't want to make changes in bio structure for that.
193  */
194 static
195 void
196 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
197 {
198         struct buf *bp;
199         char *compressed_buffer;
200         z_stream strm_decompress;
201         int result;
202         int ret;
203
204         bp = bio->bio_buf;
205
206         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
207         strm_decompress.avail_in = 0;
208         strm_decompress.next_in = Z_NULL;
209
210         ret = inflateInit(&strm_decompress);
211
212         if (ret != Z_OK)
213                 kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");
214
215         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
216         strm_decompress.next_in = __DECONST(char *, data);
217
218         /* XXX supply proper size, subset of device bp */
219         strm_decompress.avail_in = bytes;
220         strm_decompress.next_out = compressed_buffer;
221         strm_decompress.avail_out = bp->b_bufsize;
222
223         ret = inflate(&strm_decompress, Z_FINISH);
224         if (ret != Z_STREAM_END) {
225                 kprintf("HAMMER2 ZLIB: Fatar error during decompression.\n");
226                 bzero(compressed_buffer, bp->b_bufsize);
227         }
228         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
229         result = bp->b_bufsize - strm_decompress.avail_out;
230         if (result < bp->b_bufsize)
231                 bzero(bp->b_data + result, strm_decompress.avail_out);
232         objcache_put(cache_buffer_read, compressed_buffer);
233         ret = inflateEnd(&strm_decompress);
234
235         bp->b_resid = 0;
236         bp->b_flags |= B_AGE;
237 }
238
239 /*
240  * Logical buffer I/O, async read.
241  */
242 static
243 int
244 hammer2_strategy_read(struct vop_strategy_args *ap)
245 {
246         hammer2_xop_strategy_t *xop;
247         struct buf *bp;
248         struct bio *bio;
249         struct bio *nbio;
250         hammer2_inode_t *ip;
251         hammer2_key_t lbase;
252
253         bio = ap->a_bio;
254         bp = bio->bio_buf;
255         ip = VTOI(ap->a_vp);
256         nbio = push_bio(bio);
257
258         lbase = bio->bio_offset;
259         KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
260
261         xop = hammer2_xop_alloc(ip, 0);
262         xop->finished = 0;
263         xop->bio = bio;
264         xop->lbase = lbase;
265         hammer2_mtx_init(&xop->lock, "h2bio");
266         hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);
267
268         return(0);
269 }
270
271 /*
272  * Per-node XOP (threaded), do a synchronous lookup of the chain and
273  * its data.  The frontend is asynchronous, so we are also responsible
274  * for racing to terminate the frontend.
275  */
276 static
277 void
278 hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
279 {
280         hammer2_xop_strategy_t *xop = &arg->xop_strategy;
281         hammer2_chain_t *parent;
282         hammer2_chain_t *chain;
283         hammer2_key_t key_dummy;
284         hammer2_key_t lbase;
285         struct bio *bio;
286         struct buf *bp;
287         int cache_index = -1;
288         int error;
289
290         lbase = xop->lbase;
291         bio = xop->bio;
292         bp = bio->bio_buf;
293
294         parent = hammer2_inode_chain(xop->head.ip1, clindex,
295                                      HAMMER2_RESOLVE_ALWAYS |
296                                      HAMMER2_RESOLVE_SHARED);
297         if (parent) {
298                 chain = hammer2_chain_lookup(&parent, &key_dummy,
299                                              lbase, lbase,
300                                              &cache_index,
301                                              HAMMER2_LOOKUP_ALWAYS |
302                                              HAMMER2_LOOKUP_SHARED);
303                 error = chain ? chain->error : 0;
304         } else {
305                 error = EIO;
306                 chain = NULL;
307         }
308         error = hammer2_xop_feed(&xop->head, chain, clindex, error);
309         if (chain)
310                 hammer2_chain_drop(chain);
311         if (parent) {
312                 hammer2_chain_unlock(parent);
313                 hammer2_chain_drop(parent);
314         }
315         chain = NULL;   /* safety */
316         parent = NULL;  /* safety */
317
318         /*
319          * Race to finish the frontend
320          */
321         if (xop->finished)
322                 return;
323         hammer2_mtx_ex(&xop->lock);
324         if (xop->finished) {
325                 hammer2_mtx_unlock(&xop->lock);
326                 return;
327         }
328
329         /*
330          * Async operation has not completed and we now own the lock.
331          * Determine if we can complete the operation by issuing the
332          * frontend collection non-blocking.
333          */
334         error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
335
336         switch(error) {
337         case 0:
338                 xop->finished = 1;
339                 hammer2_mtx_unlock(&xop->lock);
340                 chain = xop->head.cluster.focus;
341                 hammer2_strategy_read_completion(chain, (char *)chain->data,
342                                                  xop->bio);
343                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
344                 biodone(bio);
345                 break;
346         case ENOENT:
347                 xop->finished = 1;
348                 hammer2_mtx_unlock(&xop->lock);
349                 bp->b_resid = 0;
350                 bp->b_error = 0;
351                 bzero(bp->b_data, bp->b_bcount);
352                 biodone(bio);
353                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
354                 break;
355         case EINPROGRESS:
356                 hammer2_mtx_unlock(&xop->lock);
357                 break;
358         default:
359                 xop->finished = 1;
360                 hammer2_mtx_unlock(&xop->lock);
361                 bp->b_flags |= B_ERROR;
362                 bp->b_error = EIO;
363                 biodone(bio);
364                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
365                 break;
366         }
367 }
368
369 static
370 void
371 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
372                                  struct bio *bio)
373 {
374         struct buf *bp = bio->bio_buf;
375
376         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
377                 /*
378                  * Data is embedded in the inode (copy from inode).
379                  */
380                 bcopy(((hammer2_inode_data_t *)data)->u.data,
381                       bp->b_data, HAMMER2_EMBEDDED_BYTES);
382                 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
383                       bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
384                 bp->b_resid = 0;
385                 bp->b_error = 0;
386         } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
387                 /*
388                  * Data is on-media, record for live dedup.
389                  */
390                 hammer2_dedup_record(chain, data);
391
392                 /*
393                  * Decopmression and copy.
394                  */
395                 switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
396                 case HAMMER2_COMP_LZ4:
397                         hammer2_decompress_LZ4_callback(data, chain->bytes,
398                                                         bio);
399                         break;
400                 case HAMMER2_COMP_ZLIB:
401                         hammer2_decompress_ZLIB_callback(data, chain->bytes,
402                                                          bio);
403                         break;
404                 case HAMMER2_COMP_NONE:
405                         KKASSERT(chain->bytes <= bp->b_bcount);
406                         bcopy(data, bp->b_data, chain->bytes);
407                         if (chain->bytes < bp->b_bcount) {
408                                 bzero(bp->b_data + chain->bytes,
409                                       bp->b_bcount - chain->bytes);
410                         }
411                         bp->b_flags |= B_NOTMETA;
412                         bp->b_resid = 0;
413                         bp->b_error = 0;
414                         break;
415                 default:
416                         panic("hammer2_strategy_read: "
417                               "unknown compression type");
418                 }
419         } else {
420                 panic("hammer2_strategy_read: unknown bref type");
421         }
422 }
423
424 /****************************************************************************
425  *                              WRITE SUPPORT                               *
426  ****************************************************************************/
427
428 /* 
429  * Functions for compression in threads,
430  * from hammer2_vnops.c
431  */
/*
 * Top-level write helper; picks a write path from the inode's
 * compression setting.
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                hammer2_tid_t mtid, int *errorp);
/* Zero-block check followed by an LZ4/ZLIB compression attempt. */
static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                hammer2_tid_t mtid, int *errorp,
                                int comp_algo, int check_algo);
/* Used for HAMMER2_COMP_AUTOZERO: check for zero-fill only. */
static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                hammer2_tid_t mtid, int *errorp,
                                int check_algo);
/*
 * A non-zero result sends hammer2_compress_and_write() down the
 * zero_write() path; presumably it means the block is all zeros --
 * TODO confirm against the definition.
 */
static int test_block_zeros(const char *buf, size_t bytes);
/* Called instead of a normal write when test_block_zeros() is non-zero. */
static void zero_write(struct buf *bp, hammer2_inode_t *ip,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase,
                                hammer2_tid_t mtid, int *errorp);
/* Writes the logical buffer's contents for the given chain. */
static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
                                int ioflag, int pblksize,
                                hammer2_tid_t mtid, int *errorp,
                                int check_algo);
455
456 static
457 int
458 hammer2_strategy_write(struct vop_strategy_args *ap)
459 {       
460         hammer2_xop_strategy_t *xop;
461         hammer2_pfs_t *pmp;
462         struct bio *bio;
463         struct buf *bp;
464         hammer2_inode_t *ip;
465         
466         bio = ap->a_bio;
467         bp = bio->bio_buf;
468         ip = VTOI(ap->a_vp);
469         pmp = ip->pmp;
470         
471         hammer2_lwinprog_ref(pmp);
472         hammer2_trans_assert_strategy(pmp);
473
474         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
475         xop->finished = 0;
476         xop->bio = bio;
477         xop->lbase = bio->bio_offset;
478         hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
479         /* asynchronous completion */
480
481         hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);
482
483         return(0);
484 }
485
486 /*
487  * Per-node XOP (threaded).  Write the logical buffer to the media.
488  */
489 static
490 void
491 hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
492 {
493         hammer2_xop_strategy_t *xop = &arg->xop_strategy;
494         hammer2_chain_t *parent;
495         hammer2_key_t lbase;
496         hammer2_inode_t *ip;
497         struct bio *bio;
498         struct buf *bp;
499         int error;
500         int lblksize;
501         int pblksize;
502
503         lbase = xop->lbase;
504         bio = xop->bio;
505         bp = bio->bio_buf;
506         ip = xop->head.ip1;
507
508         /* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */
509
510         lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
511         pblksize = hammer2_calc_physical(ip, lbase);
512         parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
513         hammer2_write_file_core(bp, ip, &parent,
514                                 lbase, IO_ASYNC, pblksize,
515                                 xop->head.mtid, &error);
516         if (parent) {
517                 hammer2_chain_unlock(parent);
518                 hammer2_chain_drop(parent);
519                 parent = NULL;  /* safety */
520         }
521         error = hammer2_xop_feed(&xop->head, NULL, clindex, error);
522
523         /*
524          * Race to finish the frontend
525          */
526         if (xop->finished)
527                 return;
528         hammer2_mtx_ex(&xop->lock);
529         if (xop->finished) {
530                 hammer2_mtx_unlock(&xop->lock);
531                 return;
532         }
533
534         /*
535          * Async operation has not completed and we now own the lock.
536          * Determine if we can complete the operation by issuing the
537          * frontend collection non-blocking.
538          */
539         error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
540
541         switch(error) {
542         case ENOENT:
543         case 0:
544                 xop->finished = 1;
545                 hammer2_mtx_unlock(&xop->lock);
546                 bp->b_resid = 0;
547                 bp->b_error = 0;
548                 biodone(bio);
549                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
550                 hammer2_lwinprog_drop(ip->pmp);
551                 break;
552         case EINPROGRESS:
553                 hammer2_mtx_unlock(&xop->lock);
554                 break;
555         default:
556                 xop->finished = 1;
557                 hammer2_mtx_unlock(&xop->lock);
558                 bp->b_flags |= B_ERROR;
559                 bp->b_error = EIO;
560                 biodone(bio);
561                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
562                 hammer2_lwinprog_drop(ip->pmp);
563                 break;
564         }
565 }
566
/*
 * Wait for pending I/O to complete.
 *
 * Calls hammer2_lwinprog_wait() on the PFS with 0 as its second argument
 * (the write strategy frontend passes hammer2_flush_pipe there instead).
 *
 * NOTE(review): presumably this blocks until no logical writes remain in
 * progress on pmp -- confirm against hammer2_lwinprog_wait().
 */
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        hammer2_lwinprog_wait(pmp, 0);
}
575
576 /* 
577  * Create a new cluster at (cparent, lbase) and assign physical storage,
578  * returning a cluster suitable for I/O.  The cluster will be in a modified
579  * state.
580  *
581  * cparent can wind up being anything.
582  *
583  * If datap is not NULL, *datap points to the real data we intend to write.
584  * If we can dedup the storage location we set *datap to NULL to indicate
585  * to the caller that a dedup occurred.
586  *
587  * NOTE: Special case for data embedded in inode.
588  */
589 static
590 hammer2_chain_t *
591 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
592                         hammer2_key_t lbase, int pblksize,
593                         hammer2_tid_t mtid, char **datap, int *errorp)
594 {
595         hammer2_chain_t *chain;
596         hammer2_key_t key_dummy;
597         hammer2_off_t dedup_off;
598         int pradix = hammer2_getradix(pblksize);
599         int cache_index = -1;
600
601         /*
602          * Locate the chain associated with lbase, return a locked chain.
603          * However, do not instantiate any data reference (which utilizes a
604          * device buffer) because we will be using direct IO via the
605          * logical buffer cache buffer.
606          */
607         *errorp = 0;
608         KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
609 retry:
610         chain = hammer2_chain_lookup(parentp, &key_dummy,
611                                      lbase, lbase,
612                                      &cache_index,
613                                      HAMMER2_LOOKUP_NODATA);
614         if (chain == NULL) {
615                 /*
616                  * We found a hole, create a new chain entry.
617                  *
618                  * NOTE: DATA chains are created without device backing
619                  *       store (nor do we want any).
620                  */
621                 dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap,
622                                                  pblksize);
623                 *errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
624                                                lbase, HAMMER2_PBUFRADIX,
625                                                HAMMER2_BREF_TYPE_DATA,
626                                                pblksize, mtid,
627                                                dedup_off, 0);
628                 if (chain == NULL) {
629                         panic("hammer2_chain_create: par=%p error=%d\n",
630                               *parentp, *errorp);
631                         goto retry;
632                 }
633                 /*ip->delta_dcount += pblksize;*/
634         } else {
635                 switch (chain->bref.type) {
636                 case HAMMER2_BREF_TYPE_INODE:
637                         /*
638                          * The data is embedded in the inode, which requires
639                          * a bit more finess.
640                          */
641                         hammer2_chain_modify_ip(ip, chain, mtid, 0);
642                         break;
643                 case HAMMER2_BREF_TYPE_DATA:
644                         dedup_off = hammer2_dedup_lookup(chain->hmp, datap,
645                                                          pblksize);
646                         if (chain->bytes != pblksize) {
647                                 hammer2_chain_resize(ip, *parentp, chain,
648                                                      mtid, dedup_off,
649                                                      pradix,
650                                                      HAMMER2_MODIFY_OPTDATA);
651                         }
652
653                         /*
654                          * DATA buffers must be marked modified whether the
655                          * data is in a logical buffer or not.  We also have
656                          * to make this call to fixup the chain data pointers
657                          * after resizing in case this is an encrypted or
658                          * compressed buffer.
659                          */
660                         hammer2_chain_modify(chain, mtid, dedup_off,
661                                              HAMMER2_MODIFY_OPTDATA);
662                         break;
663                 default:
664                         panic("hammer2_assign_physical: bad type");
665                         /* NOT REACHED */
666                         break;
667                 }
668         }
669         return (chain);
670 }
671
672 /* 
673  * hammer2_write_file_core() - hammer2_write_thread() helper
674  *
675  * The core write function which determines which path to take
676  * depending on compression settings.  We also have to locate the
677  * related chains so we can calculate and set the check data for
678  * the blockref.
679  */
680 static
681 void
682 hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
683                         hammer2_chain_t **parentp,
684                         hammer2_key_t lbase, int ioflag, int pblksize,
685                         hammer2_tid_t mtid, int *errorp)
686 {
687         hammer2_chain_t *chain;
688         char *data = bp->b_data;
689
690         switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
691         case HAMMER2_COMP_NONE:
692                 /*
693                  * We have to assign physical storage to the buffer
694                  * we intend to dirty or write now to avoid deadlocks
695                  * in the strategy code later.
696                  *
697                  * This can return NOOFFSET for inode-embedded data.
698                  * The strategy code will take care of it in that case.
699                  */
700                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
701                                                 mtid, &data, errorp);
702                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
703                         hammer2_inode_data_t *wipdata;
704
705                         wipdata = &chain->data->ipdata;
706                         KKASSERT(wipdata->meta.op_flags &
707                                  HAMMER2_OPFLAG_DIRECTDATA);
708                         KKASSERT(bp->b_loffset == 0);
709                         bcopy(bp->b_data, wipdata->u.data,
710                               HAMMER2_EMBEDDED_BYTES);
711                         ++hammer2_iod_file_wembed;
712                 } else if (data == NULL) {
713                         /*
714                          * Copy of data already present on-media.
715                          */
716                         chain->bref.methods =
717                                 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
718                                 HAMMER2_ENC_CHECK(ip->meta.check_algo);
719                         hammer2_chain_setcheck(chain, bp->b_data);
720                 } else {
721                         hammer2_write_bp(chain, bp, ioflag, pblksize,
722                                          mtid, errorp, ip->meta.check_algo);
723                 }
724                 if (chain) {
725                         hammer2_chain_unlock(chain);
726                         hammer2_chain_drop(chain);
727                 }
728                 break;
729         case HAMMER2_COMP_AUTOZERO:
730                 /*
731                  * Check for zero-fill only
732                  */
733                 hammer2_zero_check_and_write(bp, ip, parentp,
734                                              lbase, ioflag, pblksize,
735                                              mtid, errorp,
736                                              ip->meta.check_algo);
737                 break;
738         case HAMMER2_COMP_LZ4:
739         case HAMMER2_COMP_ZLIB:
740         default:
741                 /*
742                  * Check for zero-fill and attempt compression.
743                  */
744                 hammer2_compress_and_write(bp, ip, parentp,
745                                            lbase, ioflag, pblksize,
746                                            mtid, errorp,
747                                            ip->meta.comp_algo,
748                                            ip->meta.check_algo);
749                 break;
750         }
751 }
752
753 /*
754  * Helper
755  *
756  * Generic function that will perform the compression in compression
757  * write path. The compression algorithm is determined by the settings
758  * obtained from inode.
759  */
760 static
761 void
762 hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
763         hammer2_chain_t **parentp,
764         hammer2_key_t lbase, int ioflag, int pblksize,
765         hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo)
766 {
767         hammer2_chain_t *chain;
768         int comp_size;
769         int comp_block_size;
770         char *comp_buffer;
771         char *data;
772
773         if (test_block_zeros(bp->b_data, pblksize)) {
774                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
775                 return;
776         }
777
778         comp_size = 0;
779         comp_buffer = NULL;
780
781         KKASSERT(pblksize / 2 <= 32768);
782                 
783         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
784                 z_stream strm_compress;
785                 int comp_level;
786                 int ret;
787
788                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
789                 case HAMMER2_COMP_LZ4:
790                         comp_buffer = objcache_get(cache_buffer_write,
791                                                    M_INTWAIT);
792                         comp_size = LZ4_compress_limitedOutput(
793                                         bp->b_data,
794                                         &comp_buffer[sizeof(int)],
795                                         pblksize,
796                                         pblksize / 2 - sizeof(int));
797                         /*
798                          * We need to prefix with the size, LZ4
799                          * doesn't do it for us.  Add the related
800                          * overhead.
801                          */
802                         *(int *)comp_buffer = comp_size;
803                         if (comp_size)
804                                 comp_size += sizeof(int);
805                         break;
806                 case HAMMER2_COMP_ZLIB:
807                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
808                         if (comp_level == 0)
809                                 comp_level = 6; /* default zlib compression */
810                         else if (comp_level < 6)
811                                 comp_level = 6;
812                         else if (comp_level > 9)
813                                 comp_level = 9;
814                         ret = deflateInit(&strm_compress, comp_level);
815                         if (ret != Z_OK) {
816                                 kprintf("HAMMER2 ZLIB: fatal error "
817                                         "on deflateInit.\n");
818                         }
819
820                         comp_buffer = objcache_get(cache_buffer_write,
821                                                    M_INTWAIT);
822                         strm_compress.next_in = bp->b_data;
823                         strm_compress.avail_in = pblksize;
824                         strm_compress.next_out = comp_buffer;
825                         strm_compress.avail_out = pblksize / 2;
826                         ret = deflate(&strm_compress, Z_FINISH);
827                         if (ret == Z_STREAM_END) {
828                                 comp_size = pblksize / 2 -
829                                             strm_compress.avail_out;
830                         } else {
831                                 comp_size = 0;
832                         }
833                         ret = deflateEnd(&strm_compress);
834                         break;
835                 default:
836                         kprintf("Error: Unknown compression method.\n");
837                         kprintf("Comp_method = %d.\n", comp_algo);
838                         break;
839                 }
840         }
841
842         if (comp_size == 0) {
843                 /*
844                  * compression failed or turned off
845                  */
846                 comp_block_size = pblksize;     /* safety */
847                 if (++ip->comp_heuristic > 128)
848                         ip->comp_heuristic = 8;
849         } else {
850                 /*
851                  * compression succeeded
852                  */
853                 ip->comp_heuristic = 0;
854                 if (comp_size <= 1024) {
855                         comp_block_size = 1024;
856                 } else if (comp_size <= 2048) {
857                         comp_block_size = 2048;
858                 } else if (comp_size <= 4096) {
859                         comp_block_size = 4096;
860                 } else if (comp_size <= 8192) {
861                         comp_block_size = 8192;
862                 } else if (comp_size <= 16384) {
863                         comp_block_size = 16384;
864                 } else if (comp_size <= 32768) {
865                         comp_block_size = 32768;
866                 } else {
867                         panic("hammer2: WRITE PATH: "
868                               "Weird comp_size value.");
869                         /* NOT REACHED */
870                         comp_block_size = pblksize;
871                 }
872
873                 /*
874                  * Must zero the remainder or dedup (which operates on a
875                  * physical block basis) will not find matches.
876                  */
877                 if (comp_size < comp_block_size) {
878                         bzero(comp_buffer + comp_size,
879                               comp_block_size - comp_size);
880                 }
881         }
882
883         /*
884          * Assign physical storage, data will be set to NULL if a live-dedup
885          * was successful.
886          */
887         data = comp_size ? comp_buffer : bp->b_data;
888         chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size,
889                                         mtid, &data, errorp);
890
891         if (*errorp) {
892                 kprintf("WRITE PATH: An error occurred while "
893                         "assigning physical space.\n");
894                 KKASSERT(chain == NULL);
895                 goto done;
896         }
897
898         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
899                 hammer2_inode_data_t *wipdata;
900
901                 hammer2_chain_modify_ip(ip, chain, mtid, 0);
902                 wipdata = &chain->data->ipdata;
903                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
904                 KKASSERT(bp->b_loffset == 0);
905                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
906                 ++hammer2_iod_file_wembed;
907         } else if (data == NULL) {
908                 /*
909                  * Live deduplication, a copy of the data is already present
910                  * on the media.
911                  */
912                 char *bdata;
913
914                 if (comp_size) {
915                         chain->bref.methods =
916                                 HAMMER2_ENC_COMP(comp_algo) +
917                                 HAMMER2_ENC_CHECK(check_algo);
918                 } else {
919                         chain->bref.methods =
920                                 HAMMER2_ENC_COMP(
921                                         HAMMER2_COMP_NONE) +
922                                 HAMMER2_ENC_CHECK(check_algo);
923                 }
924                 bdata = comp_size ? comp_buffer : bp->b_data;
925                 hammer2_chain_setcheck(chain, bdata);
926                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
927         } else {
928                 hammer2_io_t *dio;
929                 char *bdata;
930
931                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
932
933                 switch(chain->bref.type) {
934                 case HAMMER2_BREF_TYPE_INODE:
935                         panic("hammer2_write_bp: unexpected inode\n");
936                         break;
937                 case HAMMER2_BREF_TYPE_DATA:
938                         /*
939                          * Optimize out the read-before-write
940                          * if possible.
941                          */
942                         *errorp = hammer2_io_newnz(chain->hmp,
943                                                    chain->bref.data_off,
944                                                    chain->bytes,
945                                                    &dio);
946                         if (*errorp) {
947                                 hammer2_io_brelse(&dio);
948                                 kprintf("hammer2: WRITE PATH: "
949                                         "dbp bread error\n");
950                                 break;
951                         }
952                         bdata = hammer2_io_data(dio, chain->bref.data_off);
953
954                         /*
955                          * When loading the block make sure we don't
956                          * leave garbage after the compressed data.
957                          */
958                         if (comp_size) {
959                                 chain->bref.methods =
960                                         HAMMER2_ENC_COMP(comp_algo) +
961                                         HAMMER2_ENC_CHECK(check_algo);
962                                 bcopy(comp_buffer, bdata, comp_size);
963                         } else {
964                                 chain->bref.methods =
965                                         HAMMER2_ENC_COMP(
966                                                 HAMMER2_COMP_NONE) +
967                                         HAMMER2_ENC_CHECK(check_algo);
968                                 bcopy(bp->b_data, bdata, pblksize);
969                         }
970
971                         /*
972                          * The flush code doesn't calculate check codes for
973                          * file data (doing so can result in excessive I/O),
974                          * so we do it here.
975                          */
976                         hammer2_chain_setcheck(chain, bdata);
977                         hammer2_dedup_record(chain, bdata);
978
979                         /*
980                          * Device buffer is now valid, chain is no longer in
981                          * the initial state.
982                          *
983                          * (No blockref table worries with file data)
984                          */
985                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
986
987                         /* Now write the related bdp. */
988                         if (ioflag & IO_SYNC) {
989                                 /*
990                                  * Synchronous I/O requested.
991                                  */
992                                 hammer2_io_bwrite(&dio);
993                         /*
994                         } else if ((ioflag & IO_DIRECT) &&
995                                    loff + n == pblksize) {
996                                 hammer2_io_bdwrite(&dio);
997                         */
998                         } else if (ioflag & IO_ASYNC) {
999                                 hammer2_io_bawrite(&dio);
1000                         } else {
1001                                 hammer2_io_bdwrite(&dio);
1002                         }
1003                         break;
1004                 default:
1005                         panic("hammer2_write_bp: bad chain type %d\n",
1006                                 chain->bref.type);
1007                         /* NOT REACHED */
1008                         break;
1009                 }
1010         }
1011 done:
1012         if (chain) {
1013                 hammer2_chain_unlock(chain);
1014                 hammer2_chain_drop(chain);
1015         }
1016         if (comp_buffer)
1017                 objcache_put(cache_buffer_write, comp_buffer);
1018 }
1019
1020 /*
1021  * Helper
1022  *
1023  * Function that performs zero-checking and writing without compression,
1024  * it corresponds to default zero-checking path.
1025  */
1026 static
1027 void
1028 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
1029         hammer2_chain_t **parentp,
1030         hammer2_key_t lbase, int ioflag, int pblksize,
1031         hammer2_tid_t mtid, int *errorp,
1032         int check_algo)
1033 {
1034         hammer2_chain_t *chain;
1035         char *data = bp->b_data;
1036
1037         if (test_block_zeros(bp->b_data, pblksize)) {
1038                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
1039         } else {
1040                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
1041                                                 mtid, &data, errorp);
1042                 if (data) {
1043                         hammer2_write_bp(chain, bp, ioflag, pblksize,
1044                                          mtid, errorp, check_algo);
1045                 } /* else dedup occurred */
1046                 if (chain) {
1047                         hammer2_chain_unlock(chain);
1048                         hammer2_chain_drop(chain);
1049                 }
1050         }
1051 }
1052
/*
 * Helper
 *
 * A function to test whether a block of data contains only zeros,
 * returns TRUE (non-zero) if the block is all zeros.
 *
 * buf   - start of the block, any alignment.
 * bytes - number of bytes to examine, any value including 0 (an empty
 *         block counts as all zeros).
 *
 * Exactly bytes bytes are examined.  The bulk of the block is scanned a
 * long at a time; an unaligned head and a tail shorter than a long are
 * scanned bytewise so the scan never reads outside the block and never
 * issues a misaligned long load.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
        size_t i = 0;

        /* leading bytes up to the first long-aligned address */
        while (i < bytes && ((uintptr_t)(buf + i) % sizeof(long)) != 0) {
                if (buf[i] != 0)
                        return (0);
                ++i;
        }

        /* whole words */
        while (bytes - i >= sizeof(long)) {
                if (*(const long *)(buf + i) != 0)
                        return (0);
                i += sizeof(long);
        }

        /* trailing bytes that do not fill a word */
        while (i < bytes) {
                if (buf[i] != 0)
                        return (0);
                ++i;
        }
        return (1);
}
1071
1072 /*
1073  * Helper
1074  *
1075  * Function to "write" a block that contains only zeros.
1076  */
1077 static
1078 void
1079 zero_write(struct buf *bp, hammer2_inode_t *ip,
1080            hammer2_chain_t **parentp,
1081            hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused)
1082 {
1083         hammer2_chain_t *chain;
1084         hammer2_key_t key_dummy;
1085         int cache_index = -1;
1086
1087         chain = hammer2_chain_lookup(parentp, &key_dummy,
1088                                      lbase, lbase,
1089                                      &cache_index,
1090                                      HAMMER2_LOOKUP_NODATA);
1091         if (chain) {
1092                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1093                         hammer2_inode_data_t *wipdata;
1094
1095                         hammer2_chain_modify_ip(ip, chain, mtid, 0);
1096                         wipdata = &chain->data->ipdata;
1097                         KKASSERT(wipdata->meta.op_flags &
1098                                  HAMMER2_OPFLAG_DIRECTDATA);
1099                         KKASSERT(bp->b_loffset == 0);
1100                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1101                         ++hammer2_iod_file_wembed;
1102                 } else {
1103                         hammer2_chain_delete(*parentp, chain,
1104                                              mtid, HAMMER2_DELETE_PERMANENT);
1105                         ++hammer2_iod_file_wzero;
1106                 }
1107                 hammer2_chain_unlock(chain);
1108                 hammer2_chain_drop(chain);
1109         } else {
1110                 ++hammer2_iod_file_wzero;
1111         }
1112 }
1113
1114 /*
1115  * Helper
1116  *
1117  * Function to write the data as it is, without performing any sort of
1118  * compression. This function is used in path without compression and
1119  * default zero-checking path.
1120  */
1121 static
1122 void
1123 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
1124                  int pblksize,
1125                  hammer2_tid_t mtid, int *errorp, int check_algo)
1126 {
1127         hammer2_inode_data_t *wipdata;
1128         hammer2_io_t *dio;
1129         char *bdata;
1130         int error;
1131
1132         error = 0;      /* XXX TODO below */
1133
1134         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1135
1136         switch(chain->bref.type) {
1137         case HAMMER2_BREF_TYPE_INODE:
1138                 wipdata = &chain->data->ipdata;
1139                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1140                 KKASSERT(bp->b_loffset == 0);
1141                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1142                 error = 0;
1143                 ++hammer2_iod_file_wembed;
1144                 break;
1145         case HAMMER2_BREF_TYPE_DATA:
1146                 error = hammer2_io_newnz(chain->hmp,
1147                                          chain->bref.data_off,
1148                                          chain->bytes, &dio);
1149                 if (error) {
1150                         hammer2_io_bqrelse(&dio);
1151                         kprintf("hammer2: WRITE PATH: "
1152                                 "dbp bread error\n");
1153                         break;
1154                 }
1155                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1156
1157                 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
1158                                       HAMMER2_ENC_CHECK(check_algo);
1159                 bcopy(bp->b_data, bdata, chain->bytes);
1160
1161                 /*
1162                  * The flush code doesn't calculate check codes for
1163                  * file data (doing so can result in excessive I/O),
1164                  * so we do it here.
1165                  */
1166                 hammer2_chain_setcheck(chain, bdata);
1167                 hammer2_dedup_record(chain, bdata);
1168
1169                 /*
1170                  * Device buffer is now valid, chain is no longer in
1171                  * the initial state.
1172                  *
1173                  * (No blockref table worries with file data)
1174                  */
1175                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1176
1177                 if (ioflag & IO_SYNC) {
1178                         /*
1179                          * Synchronous I/O requested.
1180                          */
1181                         hammer2_io_bwrite(&dio);
1182                 /*
1183                 } else if ((ioflag & IO_DIRECT) &&
1184                            loff + n == pblksize) {
1185                         hammer2_io_bdwrite(&dio);
1186                 */
1187                 } else if (ioflag & IO_ASYNC) {
1188                         hammer2_io_bawrite(&dio);
1189                 } else {
1190                         hammer2_io_bdwrite(&dio);
1191                 }
1192                 break;
1193         default:
1194                 panic("hammer2_write_bp: bad chain type %d\n",
1195                       chain->bref.type);
1196                 /* NOT REACHED */
1197                 error = 0;
1198                 break;
1199         }
1200         KKASSERT(error == 0);   /* XXX TODO */
1201         *errorp = error;
1202 }
1203
1204 /*
1205  * LIVE DEDUP HEURISTIC
1206  *
1207  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
1208  *          All fields must be loaded into locals and validated.
1209  */
1210 static
1211 void
1212 hammer2_dedup_record(hammer2_chain_t *chain, char *data)
1213 {
1214         hammer2_dev_t *hmp;
1215         hammer2_dedup_t *dedup;
1216         int32_t crc;
1217         int best = 0;
1218         int i;
1219         int dticks;
1220
1221         hmp = chain->hmp;
1222         crc = hammer2_icrc32(data, chain->bytes);
1223         dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];
1224         for (i = 0; i < 4; ++i) {
1225                 if (dedup[i].data_crc == crc) {
1226                         best = i;
1227                         break;
1228                 }
1229                 dticks = (int)(dedup[i].ticks - dedup[best].ticks);
1230                 if (dticks < 0 || dticks > hz * 60 * 30)
1231                         best = i;
1232         }
1233         dedup += best;
1234         if (hammer2_debug & 0x40000) {
1235                 kprintf("REC %04x %08x %016jx\n",
1236                         (int)(dedup - hmp->heur_dedup),
1237                         crc,
1238                         chain->bref.data_off);
1239         }
1240         dedup->ticks = ticks;
1241         dedup->data_off = chain->bref.data_off;
1242         dedup->data_crc = crc;
1243         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
1244 }
1245
1246 static
1247 hammer2_off_t
1248 hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
1249 {
1250         hammer2_dedup_t *dedup;
1251         hammer2_io_t *dio;
1252         hammer2_off_t off;
1253         uint32_t crc;
1254         char *data;
1255         int i;
1256
1257         data = *datap;
1258         if (data == NULL)
1259                 return 0;
1260
1261         crc = hammer2_icrc32(data, pblksize);
1262         dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];
1263
1264         if (hammer2_debug & 0x40000) {
1265                 kprintf("LOC %04x/4 %08x\n",
1266                         (int)(dedup - hmp->heur_dedup),
1267                         crc);
1268         }
1269
1270         for (i = 0; i < 4; ++i) {
1271                 off = dedup[i].data_off;
1272                 cpu_ccfence();
1273                 if (dedup[i].data_crc != crc)
1274                         continue;
1275                 if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize)
1276                         continue;
1277                 dio = hammer2_io_getquick(hmp, off, pblksize);
1278                 if (dio &&
1279                     bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) {
1280                         if (hammer2_debug & 0x40000) {
1281                                 kprintf("DEDUP SUCCESS %016jx\n",
1282                                         (intmax_t)off);
1283                         }
1284                         hammer2_io_putblk(&dio);
1285                         *datap = NULL;
1286                         dedup[i].ticks = ticks; /* update use */
1287                         ++hammer2_iod_file_wdedup;
1288                         return off;             /* RETURN */
1289                 }
1290                 if (dio)
1291                         hammer2_io_putblk(&dio);
1292         }
1293         return 0;
1294 }
1295
1296 /*
1297  * Poof.  Races are ok, if someone gets in and reuses a dedup offset
1298  * before or while we are clearing it they will also recover the freemap
1299  * entry (set it to fully allocated), so a bulkfree race can only set it
1300  * to a possibly-free state.
1301  *
1302  * XXX ok, well, not really sure races are ok but going to run with it
1303  *     for the moment.
1304  */
1305 void
1306 hammer2_dedup_clear(hammer2_dev_t *hmp)
1307 {
1308         int i;
1309
1310         for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) {
1311                 hmp->heur_dedup[i].data_off = 0;
1312                 hmp->heur_dedup[i].ticks = ticks - 1;
1313         }
1314 }