Merge branch 'vendor/LIBRESSL'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_strategy.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression) 
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module handles low level logical file I/O (strategy) which backs
38  * the logical buffer cache.
39  *
40  * [De]compression, zero-block, check codes, and buffer cache operations
41  * for file data is handled here.
42  *
43  * Live dedup makes its home here as well.
44  */
45
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/buf.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/mount.h>
54 #include <sys/vnode.h>
55 #include <sys/mountctl.h>
56 #include <sys/dirent.h>
57 #include <sys/uio.h>
58 #include <sys/objcache.h>
59 #include <sys/event.h>
60 #include <sys/file.h>
61 #include <vfs/fifofs/fifo.h>
62
63 #include "hammer2.h"
64 #include "hammer2_lz4.h"
65
66 #include "zlib/hammer2_zlib.h"
67
68 struct objcache *cache_buffer_read;
69 struct objcache *cache_buffer_write;
70
71 /*
72  * Strategy code (async logical file buffer I/O from system)
73  *
74  * Except for the transaction init (which should normally not block),
75  * we essentially run the strategy operation asynchronously via a XOP.
76  *
77  * XXX This isn't supposed to be able to deadlock against vfs_sync vfsync()
78  *     calls but it has in the past when multiple flushes are queued.
79  *
80  * XXX We currently terminate the transaction once we get a quorum, otherwise
81  *     the frontend can stall, but this can leave the remaining nodes with
82  *     a potential flush conflict.  We need to delay flushes on those nodes
83  *     until running transactions complete separately from the normal
84  *     transaction sequencing.  FIXME TODO.
85  */
86 static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
87 static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
88 static int hammer2_strategy_read(struct vop_strategy_args *ap);
89 static int hammer2_strategy_write(struct vop_strategy_args *ap);
90 static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
91                                 char *data, struct bio *bio);
92
93 static hammer2_off_t hammer2_dedup_lookup(hammer2_dev_t *hmp,
94                         char **datap, int pblksize);
95
/*
 * Crude ad-hoc timing instrumentation for the strategy paths.
 * Each TIMER(id) call attributes the ticks elapsed since the previous
 * TIMER() call to the previous id, then begins accumulating under the
 * new id.  NOTE(review): not SMP-safe (plain int globals, no atomics);
 * presumably acceptable for rough debugging only -- confirm.
 */
int h2timer[32];	/* accumulated tick deltas, indexed by timer id */
int h2last;		/* `ticks' value at the last TIMER() call (0 = none yet) */
int h2lid;		/* timer id currently accumulating */

/* Multi-statement macro; do { } while(0) makes it statement-safe. */
#define TIMER(which)    do {                            \
        if (h2last)                                     \
                h2timer[h2lid] += (int)(ticks - h2last);\
        h2last = ticks;                                 \
        h2lid = which;                                  \
} while(0)
106
107 int
108 hammer2_vop_strategy(struct vop_strategy_args *ap)
109 {
110         struct bio *biop;
111         struct buf *bp;
112         int error;
113
114         biop = ap->a_bio;
115         bp = biop->bio_buf;
116
117         switch(bp->b_cmd) {
118         case BUF_CMD_READ:
119                 error = hammer2_strategy_read(ap);
120                 ++hammer2_iod_file_read;
121                 break;
122         case BUF_CMD_WRITE:
123                 error = hammer2_strategy_write(ap);
124                 ++hammer2_iod_file_write;
125                 break;
126         default:
127                 bp->b_error = error = EINVAL;
128                 bp->b_flags |= B_ERROR;
129                 biodone(biop);
130                 break;
131         }
132         return (error);
133 }
134
135 /*
136  * Return the largest contiguous physical disk range for the logical
137  * request, in bytes.
138  *
139  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
140  *
141  * Basically disabled, the logical buffer write thread has to deal with
142  * buffers one-at-a-time.  Note that this should not prevent cluster_read()
 * from reading-ahead, it simply prevents it from trying to form a single
144  * cluster buffer for the logical request.  H2 already uses 64KB buffers!
145  */
146 int
147 hammer2_vop_bmap(struct vop_bmap_args *ap)
148 {
149         *ap->a_doffsetp = NOOFFSET;
150         if (ap->a_runp)
151                 *ap->a_runp = 0;
152         if (ap->a_runb)
153                 *ap->a_runb = 0;
154         return (EOPNOTSUPP);
155 }
156
157 /****************************************************************************
158  *                              READ SUPPORT                                *
159  ****************************************************************************/
160 /* 
161  * Callback used in read path in case that a block is compressed with LZ4.
162  */
163 static
164 void
165 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
166 {
167         struct buf *bp;
168         char *compressed_buffer;
169         int compressed_size;
170         int result;
171
172         bp = bio->bio_buf;
173
174 #if 0
175         if bio->bio_caller_info2.index &&
176               bio->bio_caller_info1.uvalue32 !=
177               crc32(bp->b_data, bp->b_bufsize) --- return error
178 #endif
179
180         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
181         compressed_size = *(const int *)data;
182         KKASSERT((uint32_t)compressed_size <= bytes - sizeof(int));
183
184         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
185         result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
186                                      compressed_buffer,
187                                      compressed_size,
188                                      bp->b_bufsize);
189         if (result < 0) {
190                 kprintf("READ PATH: Error during decompression."
191                         "bio %016jx/%d\n",
192                         (intmax_t)bio->bio_offset, bytes);
193                 /* make sure it isn't random garbage */
194                 bzero(compressed_buffer, bp->b_bufsize);
195         }
196         KKASSERT(result <= bp->b_bufsize);
197         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
198         if (result < bp->b_bufsize)
199                 bzero(bp->b_data + result, bp->b_bufsize - result);
200         objcache_put(cache_buffer_read, compressed_buffer);
201         bp->b_resid = 0;
202         bp->b_flags |= B_AGE;
203 }
204
205 /*
206  * Callback used in read path in case that a block is compressed with ZLIB.
207  * It is almost identical to LZ4 callback, so in theory they can be unified,
208  * but we didn't want to make changes in bio structure for that.
209  */
210 static
211 void
212 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
213 {
214         struct buf *bp;
215         char *compressed_buffer;
216         z_stream strm_decompress;
217         int result;
218         int ret;
219
220         bp = bio->bio_buf;
221
222         KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
223         strm_decompress.avail_in = 0;
224         strm_decompress.next_in = Z_NULL;
225
226         ret = inflateInit(&strm_decompress);
227
228         if (ret != Z_OK)
229                 kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");
230
231         compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
232         strm_decompress.next_in = __DECONST(char *, data);
233
234         /* XXX supply proper size, subset of device bp */
235         strm_decompress.avail_in = bytes;
236         strm_decompress.next_out = compressed_buffer;
237         strm_decompress.avail_out = bp->b_bufsize;
238
239         ret = inflate(&strm_decompress, Z_FINISH);
240         if (ret != Z_STREAM_END) {
241                 kprintf("HAMMER2 ZLIB: Fatar error during decompression.\n");
242                 bzero(compressed_buffer, bp->b_bufsize);
243         }
244         bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
245         result = bp->b_bufsize - strm_decompress.avail_out;
246         if (result < bp->b_bufsize)
247                 bzero(bp->b_data + result, strm_decompress.avail_out);
248         objcache_put(cache_buffer_read, compressed_buffer);
249         ret = inflateEnd(&strm_decompress);
250
251         bp->b_resid = 0;
252         bp->b_flags |= B_AGE;
253 }
254
255 /*
256  * Logical buffer I/O, async read.
257  */
/*
 * Logical buffer I/O, async read.
 *
 * Allocates a strategy XOP keyed to the inode, records the bio and the
 * logical base offset, then dispatches the per-node worker
 * (hammer2_strategy_xop_read).  Completion is asynchronous: whichever
 * worker wins the frontend race calls biodone().  Always returns 0.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
        hammer2_xop_strategy_t *xop;
        struct buf *bp;
        struct bio *bio;
        struct bio *nbio;
        hammer2_inode_t *ip;
        hammer2_key_t lbase;

        bio = ap->a_bio;
        bp = bio->bio_buf;      /* NOTE(review): bp is not used below */
        ip = VTOI(ap->a_vp);
        nbio = push_bio(bio);   /* NOTE(review): return value unused;
                                 * presumably push_bio() has a required
                                 * side effect on the bio translation
                                 * stack -- confirm before removing */

        /* Logical offset must be aligned to the logical buffer size */
        lbase = bio->bio_offset;
        KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

        xop = hammer2_xop_alloc(ip, HAMMER2_XOP_STRATEGY);
        xop->finished = 0;      /* set by the worker that completes the bio */
        xop->bio = bio;
        xop->lbase = lbase;
        hammer2_mtx_init(&xop->lock, "h2bior");
        hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);
        /* asynchronous completion */

        return(0);
}
287
288 /*
289  * Per-node XOP (threaded), do a synchronous lookup of the chain and
290  * its data.  The frontend is asynchronous, so we are also responsible
291  * for racing to terminate the frontend.
292  */
/*
 * Per-node XOP worker for the read path: synchronously look up the
 * data chain for xop->lbase, feed it to the frontend collector, then
 * race the other node workers to complete the frontend bio.
 */
static
void
hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
{
        hammer2_xop_strategy_t *xop = &arg->xop_strategy;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_key_t key_dummy;
        hammer2_key_t lbase;
        struct bio *bio;
        struct buf *bp;
        int cache_index = -1;
        int error;

        TIMER(0);
        lbase = xop->lbase;
        bio = xop->bio;
        bp = bio->bio_buf;

        /*
         * This is difficult to optimize.  The logical buffer might be
         * partially dirty (contain dummy zero-fill pages), which would
         * mess up our crc calculation if we were to try a direct read.
         * So for now we always double-buffer through the underlying
         * storage.
         *
         * If not for the above problem we could conditionalize on
         * (1) 64KB buffer, (2) one chain (not multi-master) and
         * (3) !hammer2_double_buffer, and issue a direct read into the
         * logical buffer.
         */
        parent = hammer2_inode_chain(xop->head.ip1, clindex,
                                     HAMMER2_RESOLVE_ALWAYS |
                                     HAMMER2_RESOLVE_SHARED);
        TIMER(1);
        if (parent) {
                /* Shared lookup of the data chain covering lbase */
                chain = hammer2_chain_lookup(&parent, &key_dummy,
                                             lbase, lbase,
                                             &cache_index,
                                             HAMMER2_LOOKUP_ALWAYS |
                                             HAMMER2_LOOKUP_SHARED);
                error = chain ? chain->error : 0;
        } else {
                /* Could not acquire this node's inode chain */
                error = EIO;
                chain = NULL;
        }
        TIMER(2);
        /* Feed the (possibly NULL) result to the frontend collector */
        error = hammer2_xop_feed(&xop->head, chain, clindex, error);
        TIMER(3);
        if (chain) {
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
        }
        if (parent) {
                hammer2_chain_unlock(parent);
                hammer2_chain_drop(parent);
        }
        chain = NULL;   /* safety */
        parent = NULL;  /* safety */
        TIMER(4);

        /*
         * Race to finish the frontend.  Cheap unlocked check first,
         * then recheck under xop->lock before taking ownership.
         */
        if (xop->finished)
                return;
        hammer2_mtx_ex(&xop->lock);
        if (xop->finished) {
                hammer2_mtx_unlock(&xop->lock);
                return;
        }

        /*
         * Async operation has not completed and we now own the lock.
         * Determine if we can complete the operation by issuing the
         * frontend collection non-blocking.
         *
         * H2 double-buffers the data, setting B_NOTMETA on the logical
         * buffer hints to the OS that the logical buffer should not be
         * swapcached (since the device buffer can be).
         *
         * Also note that even for compressed data we would rather the
         * kernel cache/swapcache device buffers more and (decompressed)
         * logical buffers less, since that will significantly improve
         * the amount of end-user data that can be cached.
         */
        error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
        TIMER(5);

        switch(error) {
        case 0:
                /* Collection succeeded; copy/decompress into the bp */
                xop->finished = 1;
                hammer2_mtx_unlock(&xop->lock);
                bp->b_flags |= B_NOTMETA;
                chain = xop->head.cluster.focus;
                hammer2_strategy_read_completion(chain, (char *)chain->data,
                                                 xop->bio);
                biodone(bio);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                break;
        case ENOENT:
                /* Hole in the file: return zero-fill */
                xop->finished = 1;
                hammer2_mtx_unlock(&xop->lock);
                bp->b_flags |= B_NOTMETA;
                bp->b_resid = 0;
                bp->b_error = 0;
                bzero(bp->b_data, bp->b_bcount);
                biodone(bio);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                break;
        case EINPROGRESS:
                /* Not all nodes have fed yet; another worker finishes */
                hammer2_mtx_unlock(&xop->lock);
                break;
        default:
                /* Hard error: fail the bio */
                kprintf("strategy_xop_read: error %d loff=%016jx\n",
                        error, bp->b_loffset);
                xop->finished = 1;
                hammer2_mtx_unlock(&xop->lock);
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bio);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                break;
        }
        TIMER(6);
}
419
420 static
421 void
422 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
423                                  struct bio *bio)
424 {
425         struct buf *bp = bio->bio_buf;
426
427         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
428                 /*
429                  * Data is embedded in the inode (copy from inode).
430                  */
431                 bcopy(((hammer2_inode_data_t *)data)->u.data,
432                       bp->b_data, HAMMER2_EMBEDDED_BYTES);
433                 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
434                       bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
435                 bp->b_resid = 0;
436                 bp->b_error = 0;
437         } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
438                 /*
439                  * Data is on-media, record for live dedup.  Release the
440                  * chain (try to free it) when done.  The data is still
441                  * cached by both the buffer cache in front and the
442                  * block device behind us.  This leaves more room in the
443                  * LRU chain cache for meta-data chains which we really
444                  * want to retain.
445                  */
446                 hammer2_dedup_record(chain, data);
447                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
448
449                 /*
450                  * Decompression and copy.
451                  */
452                 switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
453                 case HAMMER2_COMP_LZ4:
454                         hammer2_decompress_LZ4_callback(data, chain->bytes,
455                                                         bio);
456                         /* b_resid set by call */
457                         break;
458                 case HAMMER2_COMP_ZLIB:
459                         hammer2_decompress_ZLIB_callback(data, chain->bytes,
460                                                          bio);
461                         /* b_resid set by call */
462                         break;
463                 case HAMMER2_COMP_NONE:
464                         KKASSERT(chain->bytes <= bp->b_bcount);
465                         bcopy(data, bp->b_data, chain->bytes);
466                         if (chain->bytes < bp->b_bcount) {
467                                 bzero(bp->b_data + chain->bytes,
468                                       bp->b_bcount - chain->bytes);
469                         }
470                         bp->b_resid = 0;
471                         bp->b_error = 0;
472                         break;
473                 default:
474                         panic("hammer2_strategy_read: "
475                               "unknown compression type");
476                 }
477         } else {
478                 panic("hammer2_strategy_read: unknown bref type");
479         }
480 }
481
482 /****************************************************************************
483  *                              WRITE SUPPORT                               *
484  ****************************************************************************/
485
486 /* 
487  * Functions for compression in threads,
488  * from hammer2_vnops.c
489  */
490 static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
491                                 hammer2_chain_t **parentp,
492                                 hammer2_key_t lbase, int ioflag, int pblksize,
493                                 hammer2_tid_t mtid, int *errorp);
494 static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
495                                 hammer2_chain_t **parentp,
496                                 hammer2_key_t lbase, int ioflag, int pblksize,
497                                 hammer2_tid_t mtid, int *errorp,
498                                 int comp_algo, int check_algo);
499 static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
500                                 hammer2_chain_t **parentp,
501                                 hammer2_key_t lbase, int ioflag, int pblksize,
502                                 hammer2_tid_t mtid, int *errorp,
503                                 int check_algo);
504 static int test_block_zeros(const char *buf, size_t bytes);
505 static void zero_write(struct buf *bp, hammer2_inode_t *ip,
506                                 hammer2_chain_t **parentp,
507                                 hammer2_key_t lbase,
508                                 hammer2_tid_t mtid, int *errorp);
509 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
510                                 int ioflag, int pblksize,
511                                 hammer2_tid_t mtid, int *errorp,
512                                 int check_algo);
513
/*
 * Logical buffer I/O, async write.
 *
 * Takes a logical-write-in-progress reference, opens a BUFCACHE
 * transaction, dispatches the per-node write worker
 * (hammer2_strategy_xop_write), then throttles the frontend so no
 * more than hammer2_flush_pipe logical writes are in flight at once.
 * Completion (biodone) is asynchronous.  Always returns 0.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
        hammer2_xop_strategy_t *xop;
        hammer2_pfs_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_inode_t *ip;

        bio = ap->a_bio;
        bp = bio->bio_buf;      /* NOTE(review): bp is not used below */
        ip = VTOI(ap->a_vp);
        pmp = ip->pmp;

        /* Account for the in-progress write before starting the XOP */
        hammer2_lwinprog_ref(pmp);
        hammer2_trans_assert_strategy(pmp);
        hammer2_trans_init(pmp, HAMMER2_TRANS_BUFCACHE);

        xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
                                    HAMMER2_XOP_STRATEGY);
        xop->finished = 0;      /* set by the worker that completes the bio */
        xop->bio = bio;
        xop->lbase = bio->bio_offset;
        hammer2_mtx_init(&xop->lock, "h2biow");
        hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
        /* asynchronous completion */

        /* Throttle: wait until in-flight writes drop below the pipe limit */
        hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);

        return(0);
}
546
547 /*
548  * Per-node XOP (threaded).  Write the logical buffer to the media.
549  */
/*
 * Per-node XOP worker for the write path: write the logical buffer to
 * this node's media via hammer2_write_file_core(), feed the result to
 * the frontend collector, then race the other node workers to complete
 * the frontend bio and tear down the transaction state.
 */
static
void
hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
{
        hammer2_xop_strategy_t *xop = &arg->xop_strategy;
        hammer2_chain_t *parent;
        hammer2_key_t lbase;
        hammer2_inode_t *ip;
        struct bio *bio;
        struct buf *bp;
        int error;
        int lblksize;
        int pblksize;

        lbase = xop->lbase;
        bio = xop->bio;
        bp = bio->bio_buf;
        ip = xop->head.ip1;

        /* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */

        /*
         * Compute the logical and physical block sizes for this offset
         * and write the buffer out under this node's inode chain.
         * NOTE(review): lblksize is computed but not otherwise used
         * here; hammer2_calc_logical() also (re)derives lbase.
         */
        lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
        pblksize = hammer2_calc_physical(ip, lbase);
        parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
        hammer2_write_file_core(bp, ip, &parent,
                                lbase, IO_ASYNC, pblksize,
                                xop->head.mtid, &error);
        if (parent) {
                hammer2_chain_unlock(parent);
                hammer2_chain_drop(parent);
                parent = NULL;  /* safety */
        }
        hammer2_xop_feed(&xop->head, NULL, clindex, error);

        /*
         * Race to finish the frontend.  Cheap unlocked check first,
         * then recheck under xop->lock before taking ownership.
         */
        if (xop->finished)
                return;
        hammer2_mtx_ex(&xop->lock);
        if (xop->finished) {
                hammer2_mtx_unlock(&xop->lock);
                return;
        }

        /*
         * Async operation has not completed and we now own the lock.
         * Determine if we can complete the operation by issuing the
         * frontend collection non-blocking.
         *
         * H2 double-buffers the data, setting B_NOTMETA on the logical
         * buffer hints to the OS that the logical buffer should not be
         * swapcached (since the device buffer can be).
         */
        error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

        if (error == EINPROGRESS) {
                /* Not all nodes have fed yet; another worker finishes */
                hammer2_mtx_unlock(&xop->lock);
                return;
        }

        /*
         * Async operation has completed.
         */
        xop->finished = 1;
        hammer2_mtx_unlock(&xop->lock);

        if (error == ENOENT || error == 0) {
                /* Quorum reached (ENOENT counts as success for writes) */
                bp->b_flags |= B_NOTMETA;
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(bio);
        } else {
                /* Hard error: fail the bio */
                kprintf("strategy_xop_write: error %d loff=%016jx\n",
                        error, bp->b_loffset);
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bio);
        }

        /*
         * The winning worker retires the frontend's XOP reference and
         * unwinds the transaction/lwinprog state taken by
         * hammer2_strategy_write().
         */
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        hammer2_trans_assert_strategy(ip->pmp);
        hammer2_lwinprog_drop(ip->pmp);
        hammer2_trans_done(ip->pmp);
}
634
/*
 * Wait for pending logical I/O to complete.
 *
 * Blocks until the PFS's logical-write-in-progress count drains to
 * zero (see hammer2_lwinprog_ref()/drop() in the strategy write path).
 */
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        hammer2_lwinprog_wait(pmp, 0);
}
643
644 /* 
645  * Create a new cluster at (cparent, lbase) and assign physical storage,
646  * returning a cluster suitable for I/O.  The cluster will be in a modified
647  * state.
648  *
649  * cparent can wind up being anything.
650  *
651  * If datap is not NULL, *datap points to the real data we intend to write.
652  * If we can dedup the storage location we set *datap to NULL to indicate
653  * to the caller that a dedup occurred.
654  *
655  * NOTE: Special case for data embedded in inode.
656  */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
                        hammer2_key_t lbase, int pblksize,
                        hammer2_tid_t mtid, char **datap, int *errorp)
{
        hammer2_chain_t *chain;
        hammer2_key_t key_dummy;
        hammer2_off_t dedup_off;
        int pradix = hammer2_getradix(pblksize);
        int cache_index = -1;

        /*
         * Locate the chain associated with lbase, return a locked chain.
         * However, do not instantiate any data reference (which utilizes a
         * device buffer) because we will be using direct IO via the
         * logical buffer cache buffer.
         */
        *errorp = 0;
        KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
        TIMER(30);
        chain = hammer2_chain_lookup(parentp, &key_dummy,
                                     lbase, lbase,
                                     &cache_index,
                                     HAMMER2_LOOKUP_NODATA);

        /*
         * The lookup code should not return a DELETED chain to us, unless
         * its a short-file embedded in the inode.  Then it is possible for
         * the lookup to return a deleted inode.
         */
        if (chain && (chain->flags & HAMMER2_CHAIN_DELETED) &&
            chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
                /* Diagnostic only: report and drop into the debugger */
                kprintf("assign physical deleted chain @ "
                        "%016jx (%016jx.%02x) ip %016jx\n",
                        lbase, chain->bref.data_off, chain->bref.type,
                        ip->meta.inum);
                Debugger("bleh");
        }

        if (chain == NULL) {
                /*
                 * We found a hole, create a new chain entry.
                 *
                 * NOTE: DATA chains are created without device backing
                 *       store (nor do we want any).
                 *
                 * If the dedup lookup matches, dedup_off points at the
                 * existing media copy and *datap is cleared by the
                 * lookup to tell the caller a dedup occurred.
                 */
                dedup_off = hammer2_dedup_lookup((*parentp)->hmp, datap,
                                                 pblksize);
                *errorp = hammer2_chain_create(parentp, &chain,
                                               ip->pmp,
                                       HAMMER2_ENC_CHECK(ip->meta.check_algo) |
                                       HAMMER2_ENC_COMP(HAMMER2_COMP_NONE),
                                               lbase, HAMMER2_PBUFRADIX,
                                               HAMMER2_BREF_TYPE_DATA,
                                               pblksize, mtid,
                                               dedup_off, 0);
                if (chain == NULL) {
                        /*
                         * NOTE(review): the goto after panic() is
                         * unreachable in normal operation; presumably
                         * retained for continuing from the debugger.
                         */
                        panic("hammer2_chain_create: par=%p error=%d\n",
                              *parentp, *errorp);
                        goto retry;
                }
                /*ip->delta_dcount += pblksize;*/
        } else {
                switch (chain->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * The data is embedded in the inode, which requires
                         * a bit more finesse.
                         */
                        hammer2_chain_modify_ip(ip, chain, mtid, 0);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        /* Resize the existing block if the size changed */
                        dedup_off = hammer2_dedup_lookup(chain->hmp, datap,
                                                         pblksize);
                        if (chain->bytes != pblksize) {
                                hammer2_chain_resize(ip, *parentp, chain,
                                                     mtid, dedup_off,
                                                     pradix,
                                                     HAMMER2_MODIFY_OPTDATA);
                        }

                        /*
                         * DATA buffers must be marked modified whether the
                         * data is in a logical buffer or not.  We also have
                         * to make this call to fixup the chain data pointers
                         * after resizing in case this is an encrypted or
                         * compressed buffer.
                         */
                        hammer2_chain_modify(chain, mtid, dedup_off,
                                             HAMMER2_MODIFY_OPTDATA);
                        break;
                default:
                        panic("hammer2_assign_physical: bad type");
                        /* NOT REACHED */
                        break;
                }
        }
        TIMER(31);
        return (chain);
}
759
760 /* 
761  * hammer2_write_file_core() - hammer2_write_thread() helper
762  *
763  * The core write function which determines which path to take
764  * depending on compression settings.  We also have to locate the
765  * related chains so we can calculate and set the check data for
766  * the blockref.
767  */
768 static
769 void
770 hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
771                         hammer2_chain_t **parentp,
772                         hammer2_key_t lbase, int ioflag, int pblksize,
773                         hammer2_tid_t mtid, int *errorp)
774 {
775         hammer2_chain_t *chain;
776         char *data = bp->b_data;
777
778         *errorp = 0;
779
780         switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
781         case HAMMER2_COMP_NONE:
782                 /*
783                  * We have to assign physical storage to the buffer
784                  * we intend to dirty or write now to avoid deadlocks
785                  * in the strategy code later.
786                  *
787                  * This can return NOOFFSET for inode-embedded data.
788                  * The strategy code will take care of it in that case.
789                  */
790                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
791                                                 mtid, &data, errorp);
792                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
793                         hammer2_inode_data_t *wipdata;
794
795                         wipdata = &chain->data->ipdata;
796                         KKASSERT(wipdata->meta.op_flags &
797                                  HAMMER2_OPFLAG_DIRECTDATA);
798                         KKASSERT(bp->b_loffset == 0);
799                         bcopy(bp->b_data, wipdata->u.data,
800                               HAMMER2_EMBEDDED_BYTES);
801                         ++hammer2_iod_file_wembed;
802                 } else if (data == NULL) {
803                         /*
804                          * Copy of data already present on-media.
805                          */
806                         chain->bref.methods =
807                                 HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
808                                 HAMMER2_ENC_CHECK(ip->meta.check_algo);
809                         hammer2_chain_setcheck(chain, bp->b_data);
810                 } else {
811                         hammer2_write_bp(chain, bp, ioflag, pblksize,
812                                          mtid, errorp, ip->meta.check_algo);
813                 }
814                 if (chain) {
815                         hammer2_chain_unlock(chain);
816                         hammer2_chain_drop(chain);
817                 }
818                 break;
819         case HAMMER2_COMP_AUTOZERO:
820                 /*
821                  * Check for zero-fill only
822                  */
823                 hammer2_zero_check_and_write(bp, ip, parentp,
824                                              lbase, ioflag, pblksize,
825                                              mtid, errorp,
826                                              ip->meta.check_algo);
827                 break;
828         case HAMMER2_COMP_LZ4:
829         case HAMMER2_COMP_ZLIB:
830         default:
831                 /*
832                  * Check for zero-fill and attempt compression.
833                  */
834                 hammer2_compress_and_write(bp, ip, parentp,
835                                            lbase, ioflag, pblksize,
836                                            mtid, errorp,
837                                            ip->meta.comp_algo,
838                                            ip->meta.check_algo);
839                 break;
840         }
841 }
842
843 /*
844  * Helper
845  *
846  * Generic function that will perform the compression in compression
847  * write path. The compression algorithm is determined by the settings
848  * obtained from inode.
849  */
850 static
851 void
852 hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
853         hammer2_chain_t **parentp,
854         hammer2_key_t lbase, int ioflag, int pblksize,
855         hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo)
856 {
857         hammer2_chain_t *chain;
858         int comp_size;
859         int comp_block_size;
860         char *comp_buffer;
861         char *data;
862
863         if (test_block_zeros(bp->b_data, pblksize)) {
864                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
865                 return;
866         }
867
868         comp_size = 0;
869         comp_buffer = NULL;
870
871         KKASSERT(pblksize / 2 <= 32768);
872                 
873         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
874                 z_stream strm_compress;
875                 int comp_level;
876                 int ret;
877
878                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
879                 case HAMMER2_COMP_LZ4:
880                         comp_buffer = objcache_get(cache_buffer_write,
881                                                    M_INTWAIT);
882                         comp_size = LZ4_compress_limitedOutput(
883                                         bp->b_data,
884                                         &comp_buffer[sizeof(int)],
885                                         pblksize,
886                                         pblksize / 2 - sizeof(int));
887                         /*
888                          * We need to prefix with the size, LZ4
889                          * doesn't do it for us.  Add the related
890                          * overhead.
891                          */
892                         *(int *)comp_buffer = comp_size;
893                         if (comp_size)
894                                 comp_size += sizeof(int);
895                         break;
896                 case HAMMER2_COMP_ZLIB:
897                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
898                         if (comp_level == 0)
899                                 comp_level = 6; /* default zlib compression */
900                         else if (comp_level < 6)
901                                 comp_level = 6;
902                         else if (comp_level > 9)
903                                 comp_level = 9;
904                         ret = deflateInit(&strm_compress, comp_level);
905                         if (ret != Z_OK) {
906                                 kprintf("HAMMER2 ZLIB: fatal error "
907                                         "on deflateInit.\n");
908                         }
909
910                         comp_buffer = objcache_get(cache_buffer_write,
911                                                    M_INTWAIT);
912                         strm_compress.next_in = bp->b_data;
913                         strm_compress.avail_in = pblksize;
914                         strm_compress.next_out = comp_buffer;
915                         strm_compress.avail_out = pblksize / 2;
916                         ret = deflate(&strm_compress, Z_FINISH);
917                         if (ret == Z_STREAM_END) {
918                                 comp_size = pblksize / 2 -
919                                             strm_compress.avail_out;
920                         } else {
921                                 comp_size = 0;
922                         }
923                         ret = deflateEnd(&strm_compress);
924                         break;
925                 default:
926                         kprintf("Error: Unknown compression method.\n");
927                         kprintf("Comp_method = %d.\n", comp_algo);
928                         break;
929                 }
930         }
931
932         if (comp_size == 0) {
933                 /*
934                  * compression failed or turned off
935                  */
936                 comp_block_size = pblksize;     /* safety */
937                 if (++ip->comp_heuristic > 128)
938                         ip->comp_heuristic = 8;
939         } else {
940                 /*
941                  * compression succeeded
942                  */
943                 ip->comp_heuristic = 0;
944                 if (comp_size <= 1024) {
945                         comp_block_size = 1024;
946                 } else if (comp_size <= 2048) {
947                         comp_block_size = 2048;
948                 } else if (comp_size <= 4096) {
949                         comp_block_size = 4096;
950                 } else if (comp_size <= 8192) {
951                         comp_block_size = 8192;
952                 } else if (comp_size <= 16384) {
953                         comp_block_size = 16384;
954                 } else if (comp_size <= 32768) {
955                         comp_block_size = 32768;
956                 } else {
957                         panic("hammer2: WRITE PATH: "
958                               "Weird comp_size value.");
959                         /* NOT REACHED */
960                         comp_block_size = pblksize;
961                 }
962
963                 /*
964                  * Must zero the remainder or dedup (which operates on a
965                  * physical block basis) will not find matches.
966                  */
967                 if (comp_size < comp_block_size) {
968                         bzero(comp_buffer + comp_size,
969                               comp_block_size - comp_size);
970                 }
971         }
972
973         /*
974          * Assign physical storage, data will be set to NULL if a live-dedup
975          * was successful.
976          */
977         data = comp_size ? comp_buffer : bp->b_data;
978         chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size,
979                                         mtid, &data, errorp);
980
981         if (*errorp) {
982                 kprintf("WRITE PATH: An error occurred while "
983                         "assigning physical space.\n");
984                 KKASSERT(chain == NULL);
985                 goto done;
986         }
987
988         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
989                 hammer2_inode_data_t *wipdata;
990
991                 hammer2_chain_modify_ip(ip, chain, mtid, 0);
992                 wipdata = &chain->data->ipdata;
993                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
994                 KKASSERT(bp->b_loffset == 0);
995                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
996                 ++hammer2_iod_file_wembed;
997         } else if (data == NULL) {
998                 /*
999                  * Live deduplication, a copy of the data is already present
1000                  * on the media.
1001                  */
1002                 char *bdata;
1003
1004                 if (comp_size) {
1005                         chain->bref.methods =
1006                                 HAMMER2_ENC_COMP(comp_algo) +
1007                                 HAMMER2_ENC_CHECK(check_algo);
1008                 } else {
1009                         chain->bref.methods =
1010                                 HAMMER2_ENC_COMP(
1011                                         HAMMER2_COMP_NONE) +
1012                                 HAMMER2_ENC_CHECK(check_algo);
1013                 }
1014                 bdata = comp_size ? comp_buffer : bp->b_data;
1015                 hammer2_chain_setcheck(chain, bdata);
1016                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1017         } else {
1018                 hammer2_io_t *dio;
1019                 char *bdata;
1020
1021                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1022
1023                 switch(chain->bref.type) {
1024                 case HAMMER2_BREF_TYPE_INODE:
1025                         panic("hammer2_write_bp: unexpected inode\n");
1026                         break;
1027                 case HAMMER2_BREF_TYPE_DATA:
1028                         /*
1029                          * Optimize out the read-before-write
1030                          * if possible.
1031                          */
1032                         *errorp = hammer2_io_newnz(chain->hmp,
1033                                                    chain->bref.type,
1034                                                    chain->bref.data_off,
1035                                                    chain->bytes,
1036                                                    &dio);
1037                         if (*errorp) {
1038                                 hammer2_io_brelse(&dio);
1039                                 kprintf("hammer2: WRITE PATH: "
1040                                         "dbp bread error\n");
1041                                 break;
1042                         }
1043                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1044
1045                         /*
1046                          * When loading the block make sure we don't
1047                          * leave garbage after the compressed data.
1048                          */
1049                         if (comp_size) {
1050                                 chain->bref.methods =
1051                                         HAMMER2_ENC_COMP(comp_algo) +
1052                                         HAMMER2_ENC_CHECK(check_algo);
1053                                 bcopy(comp_buffer, bdata, comp_size);
1054                         } else {
1055                                 chain->bref.methods =
1056                                         HAMMER2_ENC_COMP(
1057                                                 HAMMER2_COMP_NONE) +
1058                                         HAMMER2_ENC_CHECK(check_algo);
1059                                 bcopy(bp->b_data, bdata, pblksize);
1060                         }
1061
1062                         /*
1063                          * The flush code doesn't calculate check codes for
1064                          * file data (doing so can result in excessive I/O),
1065                          * so we do it here.
1066                          */
1067                         hammer2_chain_setcheck(chain, bdata);
1068                         hammer2_dedup_record(chain, bdata);
1069
1070                         /*
1071                          * Device buffer is now valid, chain is no longer in
1072                          * the initial state.
1073                          *
1074                          * (No blockref table worries with file data)
1075                          */
1076                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1077
1078                         /* Now write the related bdp. */
1079                         if (ioflag & IO_SYNC) {
1080                                 /*
1081                                  * Synchronous I/O requested.
1082                                  */
1083                                 hammer2_io_bwrite(&dio);
1084                         /*
1085                         } else if ((ioflag & IO_DIRECT) &&
1086                                    loff + n == pblksize) {
1087                                 hammer2_io_bdwrite(&dio);
1088                         */
1089                         } else if (ioflag & IO_ASYNC) {
1090                                 hammer2_io_bawrite(&dio);
1091                         } else {
1092                                 hammer2_io_bdwrite(&dio);
1093                         }
1094                         break;
1095                 default:
1096                         panic("hammer2_write_bp: bad chain type %d\n",
1097                                 chain->bref.type);
1098                         /* NOT REACHED */
1099                         break;
1100                 }
1101         }
1102 done:
1103         if (chain) {
1104                 hammer2_chain_unlock(chain);
1105                 hammer2_chain_drop(chain);
1106         }
1107         if (comp_buffer)
1108                 objcache_put(cache_buffer_write, comp_buffer);
1109 }
1110
1111 /*
1112  * Helper
1113  *
1114  * Function that performs zero-checking and writing without compression,
1115  * it corresponds to default zero-checking path.
1116  */
1117 static
1118 void
1119 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
1120         hammer2_chain_t **parentp,
1121         hammer2_key_t lbase, int ioflag, int pblksize,
1122         hammer2_tid_t mtid, int *errorp,
1123         int check_algo)
1124 {
1125         hammer2_chain_t *chain;
1126         char *data = bp->b_data;
1127
1128         if (test_block_zeros(bp->b_data, pblksize)) {
1129                 zero_write(bp, ip, parentp, lbase, mtid, errorp);
1130         } else {
1131                 chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
1132                                                 mtid, &data, errorp);
1133                 if (data) {
1134                         hammer2_write_bp(chain, bp, ioflag, pblksize,
1135                                          mtid, errorp, check_algo);
1136                 } /* else dedup occurred */
1137                 if (chain) {
1138                         hammer2_chain_unlock(chain);
1139                         hammer2_chain_drop(chain);
1140                 }
1141         }
1142 }
1143
1144 /*
1145  * Helper
1146  *
1147  * A function to test whether a block of data contains only zeros,
1148  * returns TRUE (non-zero) if the block is all zeros.
1149  */
1150 static
1151 int
1152 test_block_zeros(const char *buf, size_t bytes)
1153 {
1154         size_t i;
1155
1156         for (i = 0; i < bytes; i += sizeof(long)) {
1157                 if (*(const long *)(buf + i) != 0)
1158                         return (0);
1159         }
1160         return (1);
1161 }
1162
1163 /*
1164  * Helper
1165  *
1166  * Function to "write" a block that contains only zeros.
1167  */
1168 static
1169 void
1170 zero_write(struct buf *bp, hammer2_inode_t *ip,
1171            hammer2_chain_t **parentp,
1172            hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp)
1173 {
1174         hammer2_chain_t *chain;
1175         hammer2_key_t key_dummy;
1176         int cache_index = -1;
1177
1178         *errorp = 0;
1179         chain = hammer2_chain_lookup(parentp, &key_dummy,
1180                                      lbase, lbase,
1181                                      &cache_index,
1182                                      HAMMER2_LOOKUP_NODATA);
1183         if (chain) {
1184                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1185                         hammer2_inode_data_t *wipdata;
1186
1187                         hammer2_chain_modify_ip(ip, chain, mtid, 0);
1188                         wipdata = &chain->data->ipdata;
1189                         KKASSERT(wipdata->meta.op_flags &
1190                                  HAMMER2_OPFLAG_DIRECTDATA);
1191                         KKASSERT(bp->b_loffset == 0);
1192                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1193                         ++hammer2_iod_file_wembed;
1194                 } else {
1195                         hammer2_chain_delete(*parentp, chain,
1196                                              mtid, HAMMER2_DELETE_PERMANENT);
1197                         ++hammer2_iod_file_wzero;
1198                 }
1199                 hammer2_chain_unlock(chain);
1200                 hammer2_chain_drop(chain);
1201         } else {
1202                 ++hammer2_iod_file_wzero;
1203         }
1204 }
1205
1206 /*
1207  * Helper
1208  *
1209  * Function to write the data as it is, without performing any sort of
1210  * compression. This function is used in path without compression and
1211  * default zero-checking path.
1212  */
1213 static
1214 void
1215 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
1216                  int pblksize,
1217                  hammer2_tid_t mtid, int *errorp, int check_algo)
1218 {
1219         hammer2_inode_data_t *wipdata;
1220         hammer2_io_t *dio;
1221         char *bdata;
1222         int error;
1223
1224         error = 0;      /* XXX TODO below */
1225
1226         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1227
1228         switch(chain->bref.type) {
1229         case HAMMER2_BREF_TYPE_INODE:
1230                 wipdata = &chain->data->ipdata;
1231                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1232                 KKASSERT(bp->b_loffset == 0);
1233                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1234                 error = 0;
1235                 ++hammer2_iod_file_wembed;
1236                 break;
1237         case HAMMER2_BREF_TYPE_DATA:
1238                 error = hammer2_io_newnz(chain->hmp,
1239                                          chain->bref.type,
1240                                          chain->bref.data_off,
1241                                          chain->bytes, &dio);
1242                 if (error) {
1243                         hammer2_io_bqrelse(&dio);
1244                         kprintf("hammer2: WRITE PATH: "
1245                                 "dbp bread error\n");
1246                         break;
1247                 }
1248                 bdata = hammer2_io_data(dio, chain->bref.data_off);
1249
1250                 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
1251                                       HAMMER2_ENC_CHECK(check_algo);
1252                 bcopy(bp->b_data, bdata, chain->bytes);
1253
1254                 /*
1255                  * The flush code doesn't calculate check codes for
1256                  * file data (doing so can result in excessive I/O),
1257                  * so we do it here.
1258                  */
1259                 hammer2_chain_setcheck(chain, bdata);
1260                 hammer2_dedup_record(chain, bdata);
1261
1262                 /*
1263                  * Device buffer is now valid, chain is no longer in
1264                  * the initial state.
1265                  *
1266                  * (No blockref table worries with file data)
1267                  */
1268                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1269
1270                 if (ioflag & IO_SYNC) {
1271                         /*
1272                          * Synchronous I/O requested.
1273                          */
1274                         hammer2_io_bwrite(&dio);
1275                 /*
1276                 } else if ((ioflag & IO_DIRECT) &&
1277                            loff + n == pblksize) {
1278                         hammer2_io_bdwrite(&dio);
1279                 */
1280                 } else if (ioflag & IO_ASYNC) {
1281                         hammer2_io_bawrite(&dio);
1282                 } else {
1283                         hammer2_io_bdwrite(&dio);
1284                 }
1285                 break;
1286         default:
1287                 panic("hammer2_write_bp: bad chain type %d\n",
1288                       chain->bref.type);
1289                 /* NOT REACHED */
1290                 error = 0;
1291                 break;
1292         }
1293         KKASSERT(error == 0);   /* XXX TODO */
1294         *errorp = error;
1295 }
1296
1297 /*
1298  * LIVE DEDUP HEURISTIC
1299  *
1300  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
1301  *          All fields must be loaded into locals and validated.
1302  *
1303  * WARNING! Should only be used for file data, hammer2_chain_modify() only
1304  *          checks for the dedup case on data chains.  Also, dedup data can
1305  *          only be recorded for committed chains (so NOT strategy writes
1306  *          which can undergo further modification after the fact!).
1307  */
1308 void
1309 hammer2_dedup_record(hammer2_chain_t *chain, char *data)
1310 {
1311         hammer2_dev_t *hmp;
1312         hammer2_dedup_t *dedup;
1313         uint64_t crc;
1314         int best = 0;
1315         int i;
1316         int dticks;
1317
1318         if (hammer2_dedup_enable == 0)
1319                 return;
1320
1321         /*
1322          * Only committed data can be recorded for de-duplication, otherwise
1323          * the contents may change out from under us.  So, on read if the
1324          * chain is not modified, and on flush when the chain is committed.
1325          */
1326         if ((chain->flags &
1327             (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_INITIAL)) == 0) {
1328                 return;
1329         }
1330
1331
1332         hmp = chain->hmp;
1333
1334         switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
1335         case HAMMER2_CHECK_ISCSI32:
1336                 /*
1337                  * XXX use the built-in crc (the dedup lookup sequencing
1338                  * needs to be fixed so the check code is already present
1339                  * when dedup_lookup is called)
1340                  */
1341 #if 0
1342                 crc = (uint64_t)(uint32_t)chain->bref.check.iscsi32.value;
1343 #endif
1344                 crc = XXH64(data, chain->bytes, XXH_HAMMER2_SEED);
1345                 break;
1346         case HAMMER2_CHECK_XXHASH64:
1347                 crc = chain->bref.check.xxhash64.value;
1348                 break;
1349         case HAMMER2_CHECK_SHA192:
1350                 /*
1351                  * XXX use the built-in crc (the dedup lookup sequencing
1352                  * needs to be fixed so the check code is already present
1353                  * when dedup_lookup is called)
1354                  */
1355 #if 0
1356                 crc = ((uint64_t *)chain->bref.check.sha192.data)[0] ^
1357                       ((uint64_t *)chain->bref.check.sha192.data)[1] ^
1358                       ((uint64_t *)chain->bref.check.sha192.data)[2];
1359 #endif
1360                 crc = XXH64(data, chain->bytes, XXH_HAMMER2_SEED);
1361                 break;
1362         default:
1363                 /*
1364                  * Cannot dedup without a check code
1365                  *
1366                  * NOTE: In particular, CHECK_NONE allows a sector to be
1367                  *       overwritten without copy-on-write, recording
1368                  *       a dedup block for a CHECK_NONE object would be
1369                  *       a disaster!
1370                  */
1371                 return;
1372         }
1373         dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];
1374         for (i = 0; i < 4; ++i) {
1375                 if (dedup[i].data_crc == crc) {
1376                         best = i;
1377                         break;
1378                 }
1379                 dticks = (int)(dedup[i].ticks - dedup[best].ticks);
1380                 if (dticks < 0 || dticks > hz * 60 * 30)
1381                         best = i;
1382         }
1383         dedup += best;
1384         if (hammer2_debug & 0x40000) {
1385                 kprintf("REC %04x %016jx %016jx\n",
1386                         (int)(dedup - hmp->heur_dedup),
1387                         crc,
1388                         chain->bref.data_off);
1389         }
1390         dedup->ticks = ticks;
1391         dedup->data_off = chain->bref.data_off;
1392         dedup->data_crc = crc;
1393         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP);
1394 }
1395
/*
 * Look up (*datap, pblksize) in the live dedup heuristic.
 *
 * On a verified match (hash hit confirmed by a full bcmp of the media
 * data) returns the media offset of the existing copy and sets *datap
 * to NULL so the caller references the existing block instead of
 * writing a new one.  Returns 0 if no match is found or dedup is
 * disabled.
 */
static
hammer2_off_t
hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize)
{
	hammer2_dedup_t *dedup;
	hammer2_io_t *dio;
	hammer2_off_t off;
	uint64_t crc;
	char *data;
	int i;

	if (hammer2_dedup_enable == 0)
		return 0;
	data = *datap;
	if (data == NULL)
		return 0;

	/*
	 * XXX use the built-in crc (the dedup lookup sequencing
	 * needs to be fixed so the check code is already present
	 * when dedup_lookup is called)
	 */
	crc = XXH64(data, pblksize, XXH_HAMMER2_SEED);
	dedup = &hmp->heur_dedup[crc & (HAMMER2_DEDUP_HEUR_MASK & ~3)];

	if (hammer2_debug & 0x40000) {
		kprintf("LOC %04x/4 %016jx\n",
			(int)(dedup - hmp->heur_dedup),
			crc);
	}

	/*
	 * Scan the 4-entry bucket.  Entries may be updated concurrently
	 * (see hammer2_dedup_record), so data_off is snapshotted before
	 * the crc is tested.
	 */
	for (i = 0; i < 4; ++i) {
		off = dedup[i].data_off;
		cpu_ccfence();
		if (dedup[i].data_crc != crc)
			continue;
		/* recorded block size (radix in low bits) must match */
		if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize)
			continue;
		dio = hammer2_io_getquick(hmp, off, pblksize);
		if (dio &&
		    bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) {
			/*
			 * Make sure the INVALOK flag is cleared to prevent
			 * the possibly-dirty bp from being invalidated now
			 * that we are using it as part of a de-dup operation.
			 */
			if (hammer2_debug & 0x40000) {
				kprintf("DEDUP SUCCESS %016jx\n",
					(intmax_t)off);
			}
			atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALOK);
			hammer2_io_putblk(&dio);
			*datap = NULL;
			dedup[i].ticks = ticks;	/* update use */
			++hammer2_iod_file_wdedup;

			return off;		/* RETURN */
		}
		if (dio)
			hammer2_io_putblk(&dio);
	}
	return 0;
}
1459
1460 /*
1461  * Poof.  Races are ok, if someone gets in and reuses a dedup offset
1462  * before or while we are clearing it they will also recover the freemap
1463  * entry (set it to fully allocated), so a bulkfree race can only set it
1464  * to a possibly-free state.
1465  *
1466  * XXX ok, well, not really sure races are ok but going to run with it
1467  *     for the moment.
1468  */
1469 void
1470 hammer2_dedup_clear(hammer2_dev_t *hmp)
1471 {
1472         int i;
1473
1474         for (i = 0; i < HAMMER2_DEDUP_HEUR_SIZE; ++i) {
1475                 hmp->heur_dedup[i].data_off = 0;
1476                 hmp->heur_dedup[i].ticks = ticks - 1;
1477         }
1478 }