/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_flusher.c,v 1.28 2008/06/17 04:02:38 dillon Exp $
 */
/*
 * HAMMER dependency flusher thread
 *
 * Meta-data updates create buffer dependencies which are arranged as a
 * hierarchy of lists.
 */

#include "hammer.h"

static void hammer_flusher_master_thread(void *arg);
static void hammer_flusher_slave_thread(void *arg);
static void hammer_flusher_clean_loose_ios(hammer_mount_t hmp);
static void hammer_flusher_flush(hammer_mount_t hmp);
static void hammer_flusher_flush_inode(hammer_inode_t ip,
                                        hammer_transaction_t trans);
static int hammer_must_finalize_undo(hammer_mount_t hmp);
static void hammer_flusher_finalize(hammer_transaction_t trans, int final);

/*
 * Support structures for the flusher threads.
 */
struct hammer_flusher_info {
        struct hammer_mount *hmp;
        thread_t        td;
        int             startit;
        hammer_inode_t  work_array[HAMMER_FLUSH_GROUP_SIZE];
};

typedef struct hammer_flusher_info *hammer_flusher_info_t;

/*
 * Sync all inodes pending on the flusher.  This routine may have to be
 * called twice to get them all as some may be queued to a later flush group.
 */
void
hammer_flusher_sync(hammer_mount_t hmp)
{
        int seq;

        if (hmp->flusher.td) {
                seq = hmp->flusher.next;
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
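                /*
                 * Wait until the flusher's done count catches up to the
                 * sequence we captured (the signed delta handles sequence
                 * number wrap).
                 */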
                while ((int)(seq - hmp->flusher.done) > 0)
                        tsleep(&hmp->flusher.done, 0, "hmrfls", 0);
        }
}

/*
 * Sync all inodes pending on the flusher - return immediately.
 */
void
hammer_flusher_async(hammer_mount_t hmp)
{
        if (hmp->flusher.td) {
                if (hmp->flusher.signal++ == 0)
                        wakeup(&hmp->flusher.signal);
        }
}

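/*
 * Create the flusher: one master control thread plus HAMMER_MAX_FLUSHERS
 * slave work threads which the master hands flush groups to.
 */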
void
hammer_flusher_create(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        int i;

        hmp->flusher.signal = 0;
        hmp->flusher.act = 0;
        hmp->flusher.done = 0;
        hmp->flusher.next = 1;
        hmp->flusher.count = 0;
        hammer_ref(&hmp->flusher.finalize_lock);

        lwkt_create(hammer_flusher_master_thread, hmp,
                    &hmp->flusher.td, NULL, 0, -1, "hammer-M");
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                info = kmalloc(sizeof(*info), M_HAMMER, M_WAITOK|M_ZERO);
                info->hmp = hmp;
                ++hmp->flusher.count;
                hmp->flusher.info[i] = info;
                lwkt_create(hammer_flusher_slave_thread, info,
                            &info->td, NULL, 0, -1, "hammer-S%d", i);
        }
}

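/*
 * Tear down the flusher on unmount.  The master thread is told to exit
 * and each slave is woken with a negative startit, then waited upon
 * before its info structure is freed.
 */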
void
hammer_flusher_destroy(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        int i;

        /*
         * Kill the master
         */
        hmp->flusher.exiting = 1;
        while (hmp->flusher.td) {
                ++hmp->flusher.signal;
                wakeup(&hmp->flusher.signal);
                tsleep(&hmp->flusher.exiting, 0, "hmrwex", hz);
        }

        /*
         * Kill the slaves
         */
        for (i = 0; i < HAMMER_MAX_FLUSHERS; ++i) {
                if ((info = hmp->flusher.info[i]) != NULL) {
                        KKASSERT(info->startit == 0);
                        info->startit = -1;
                        wakeup(&info->startit);
                        while (info->td) {
                                tsleep(&info->td, 0, "hmrwwc", 0);
                        }
                        hmp->flusher.info[i] = NULL;
                        kfree(info, M_HAMMER);
                        --hmp->flusher.count;
                }
        }
        KKASSERT(hmp->flusher.count == 0);
}

/*
 * The master flusher thread manages the flusher sequence id and
 * synchronization with the slave work threads.
 */
static void
hammer_flusher_master_thread(void *arg)
{
        hammer_mount_t hmp = arg;

        for (;;) {
                while (hmp->flusher.group_lock)
                        tsleep(&hmp->flusher.group_lock, 0, "hmrhld", 0);
                kprintf("S");
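                /*
                 * Open the next flush group: 'act' is the group being
                 * flushed, 'next' is pre-incremented for the group the
                 * frontend will queue new inodes to, and 'done' is
                 * updated afterwards so waiters in hammer_flusher_sync()
                 * can be woken.
                 */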
                hmp->flusher.act = hmp->flusher.next;
                ++hmp->flusher.next;
                hammer_flusher_clean_loose_ios(hmp);
                hammer_flusher_flush(hmp);
                hmp->flusher.done = hmp->flusher.act;
                wakeup(&hmp->flusher.done);

                /*
                 * Wait for activity.
                 */
                if (hmp->flusher.exiting && TAILQ_EMPTY(&hmp->flush_list))
                        break;

                /*
                 * This is a hack until we can dispose of frontend buffer
                 * cache buffers on the frontend.
                 */
                while (hmp->flusher.signal == 0)
                        tsleep(&hmp->flusher.signal, 0, "hmrwwa", 0);
                hmp->flusher.signal = 0;
        }

        /*
         * And we are done.
         */
        hmp->flusher.td = NULL;
        wakeup(&hmp->flusher.exiting);
        lwkt_exit();
}

/*
 * The slave flusher thread pulls work off the master flush_list until no
 * work is left.
 */
static void
hammer_flusher_slave_thread(void *arg)
{
        hammer_flusher_info_t info;
        hammer_mount_t hmp;
        hammer_inode_t ip;
        int c;
        int i;
        int n;

        info = arg;
        hmp = info->hmp;

        for (;;) {
                while (info->startit == 0)
                        tsleep(&info->startit, 0, "hmrssw", 0);
                if (info->startit < 0)
                        break;
                info->startit = 0;

                /*
                 * Try to pull out ~64 inodes at a time to flush.
                 * The idea is to try to avoid deadlocks between the slaves.
                 */
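                /*
                 * Batch inodes from the flush_list until the work array
                 * fills up or the accumulated reserved-record count (c)
                 * gets too large, then flush the batch.
                 */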
                n = c = 0;
                while ((ip = TAILQ_FIRST(&hmp->flush_list)) != NULL) {
                        if (ip->flush_group != hmp->flusher.act)
                                break;
                        TAILQ_REMOVE(&hmp->flush_list, ip, flush_entry);
                        info->work_array[n++] = ip;
                        c += ip->rsv_recs;
                        if (n < HAMMER_FLUSH_GROUP_SIZE &&
                            c < HAMMER_FLUSH_GROUP_SIZE * 8) {
                                continue;
                        }
                        for (i = 0; i < n; ++i) {
                                hammer_flusher_flush_inode(info->work_array[i],
                                                        &hmp->flusher.trans);
                        }
                        n = c = 0;
                }
                for (i = 0; i < n; ++i) {
                        hammer_flusher_flush_inode(info->work_array[i],
                                                   &hmp->flusher.trans);
                }
                if (--hmp->flusher.running == 0)
                        wakeup(&hmp->flusher.running);
        }
        info->td = NULL;
        wakeup(&info->td);
        lwkt_exit();
}

static void
hammer_flusher_clean_loose_ios(hammer_mount_t hmp)
{
        hammer_buffer_t buffer;
        hammer_io_t io;
        int panic_count = 1000000;

        /*
         * loose ends - buffers without bp's aren't tracked by the kernel
         * and can build up, so clean them out.  This can occur when an
         * IO completes on a buffer with no references left.
         */
        crit_enter();   /* biodone() race */
        while ((io = TAILQ_FIRST(&hmp->lose_list)) != NULL) {
                KKASSERT(--panic_count > 0);
                KKASSERT(io->mod_list == &hmp->lose_list);
                TAILQ_REMOVE(&hmp->lose_list, io, mod_entry);
                io->mod_list = NULL;
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                buffer = (void *)io;
                hammer_rel_buffer(buffer, 0);
        }
        crit_exit();
}

/*
 * Flush all inodes in the current flush group.
 */
static void
hammer_flusher_flush(hammer_mount_t hmp)
{
        hammer_flusher_info_t info;
        hammer_reserve_t resv;
        int i;
        int n;

        hammer_start_transaction_fls(&hmp->flusher.trans, hmp);

        /*
         * Start work threads.
         */
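        /*
         * Wake up to n+1 slave threads, scaled by the number of queued
         * inodes relative to the flush group size and bounded by the
         * number of slaves actually created.
         */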
        i = 0;
        n = hmp->count_iqueued / HAMMER_FLUSH_GROUP_SIZE;
        if (TAILQ_FIRST(&hmp->flush_list)) {
                for (i = 0; i <= n; ++i) {
                        if (i == HAMMER_MAX_FLUSHERS ||
                            hmp->flusher.info[i] == NULL) {
                                break;
                        }
                        info = hmp->flusher.info[i];
                        if (info->startit == 0) {
                                ++hmp->flusher.running;
                                info->startit = 1;
                                wakeup(&info->startit);
                        }
                }
        }
        while (hmp->flusher.running)
                tsleep(&hmp->flusher.running, 0, "hmrfcc", 0);

        hammer_flusher_finalize(&hmp->flusher.trans, 1);
        hmp->flusher.tid = hmp->flusher.trans.tid;

        /*
         * Clean up any freed big-blocks (typically zone-2).
         * resv->flush_group is typically set several flush groups ahead
         * of the free to ensure that the freed big-block is not reused
         * until it is safe to reuse it.
         */
        while ((resv = TAILQ_FIRST(&hmp->delay_list)) != NULL) {
                if (resv->flush_group != hmp->flusher.act)
                        break;
                hammer_reserve_clrdelay(hmp, resv);
        }
        hammer_done_transaction(&hmp->flusher.trans);
}

/*
 * Flush a single inode that is part of a flush group.
 */
static
void
hammer_flusher_flush_inode(hammer_inode_t ip, hammer_transaction_t trans)
{
        hammer_mount_t hmp = ip->hmp;

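        /*
         * The inode is synced under a shared finalize_lock.  If the UNDO
         * FIFO fills up or too many dirty buffers accumulate, the lock is
         * re-acquired exclusively below to finalize early while other
         * slaves are held off via finalize_want.
         */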
        hammer_lock_sh(&hmp->flusher.finalize_lock);
        ip->error = hammer_sync_inode(ip);
        hammer_flush_inode_done(ip);
        hammer_unlock(&hmp->flusher.finalize_lock);
        while (hmp->flusher.finalize_want)
                tsleep(&hmp->flusher.finalize_want, 0, "hmrsxx", 0);
        if (hammer_must_finalize_undo(hmp)) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                kprintf("HAMMER: Warning: UNDO area too small!\n");
                hammer_flusher_finalize(trans, 1);
                hammer_unlock(&hmp->flusher.finalize_lock);
                hmp->flusher.finalize_want = 0;
                wakeup(&hmp->flusher.finalize_want);
        } else if (trans->hmp->locked_dirty_count +
                   trans->hmp->io_running_count > hammer_limit_dirtybufs) {
                hmp->flusher.finalize_want = 1;
                hammer_lock_ex(&hmp->flusher.finalize_lock);
                kprintf("t");
                hammer_flusher_finalize(trans, 0);
                hammer_unlock(&hmp->flusher.finalize_lock);
                hmp->flusher.finalize_want = 0;
                wakeup(&hmp->flusher.finalize_want);
        }
}

/*
 * If the UNDO area gets over half full we have to flush it.  We can't
 * afford the UNDO area becoming completely full as that would break
 * the crash recovery atomicity.
 */
static
int
hammer_must_finalize_undo(hammer_mount_t hmp)
{
        if (hammer_undo_space(hmp) < hammer_undo_max(hmp) / 2) {
                hkprintf("*");
                return(1);
        } else {
                return(0);
        }
}

/*
 * Flush all pending UNDOs, wait for write completion, update the volume
 * header with the new UNDO end position, and flush it.  Then
 * asynchronously flush the meta-data.
 *
 * If this is the last finalization in a flush group we also synchronize
 * our cached blockmap and set hmp->flusher_undo_start and our cached undo
 * fifo first_offset so the next flush resets the FIFO pointers.
 */
static
void
hammer_flusher_finalize(hammer_transaction_t trans, int final)
{
        hammer_volume_t root_volume;
        hammer_blockmap_t cundomap, dundomap;
        hammer_mount_t hmp;
        hammer_io_t io;
        int count;
        int i;

        hmp = trans->hmp;
        root_volume = trans->rootvol;

        /*
         * Flush data buffers.  This can occur asynchronously and at any
         * time.  We must interlock against the frontend direct-data write
         * but do not have to acquire the sync-lock yet.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                hammer_io_write_interlock(io);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_io_done_interlock(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * The sync-lock is required for the remaining sequence.  This lock
         * prevents meta-data from being modified.
         */
        hammer_sync_lock_ex(trans);

        /*
         * If we have been asked to finalize the volume header, sync the
         * cached blockmap to the on-disk blockmap.  Generate an UNDO
         * record for the update.
         */
        if (final) {
                cundomap = &hmp->blockmap[0];
                dundomap = &root_volume->ondisk->vol0_blockmap[0];
                if (root_volume->io.modified) {
                        hammer_modify_volume(trans, root_volume,
                                             dundomap, sizeof(hmp->blockmap));
                        for (i = 0; i < HAMMER_MAX_ZONES; ++i)
                                hammer_crc_set_blockmap(&cundomap[i]);
                        bcopy(cundomap, dundomap, sizeof(hmp->blockmap));
                        hammer_modify_volume_done(root_volume);
                }
        }

        /*
         * Flush UNDOs
         */
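        /*
         * (The UNDO buffers must reach the media before any meta-data
         * buffers are flushed so crash recovery can always roll back a
         * partially completed flush.)
         */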
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * Wait for the UNDO I/Os to complete.  The UNDO records must be
         * stable on the media before the volume header can reference them.
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl1");

        /*
         * Update the on-disk volume header with the new UNDO FIFO end
         * position (do not generate new UNDO records for this change).
         * We have to do this for the UNDO FIFO whether (final) is set
         * or not.
         *
         * Also update the on-disk next_tid field.  This does not require
         * an UNDO.  However, because our TID is generated before we get
         * the sync lock, another sync may have beat us to the punch.
         *
         * The volume header will be flushed out synchronously.
         */
        dundomap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
        cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];

        if (dundomap->first_offset != cundomap->first_offset ||
            dundomap->next_offset != cundomap->next_offset) {
                hammer_modify_volume(NULL, root_volume, NULL, 0);
                dundomap->first_offset = cundomap->first_offset;
                dundomap->next_offset = cundomap->next_offset;
                hammer_crc_set_blockmap(dundomap);
                hammer_crc_set_volume(root_volume->ondisk);
                if (root_volume->ondisk->vol0_next_tid < trans->tid)
                        root_volume->ondisk->vol0_next_tid = trans->tid;
                hammer_modify_volume_done(root_volume);
        }

        if (root_volume->io.modified) {
                hammer_io_flush(&root_volume->io);
        }

        /*
         * Wait for I/Os to complete
         */
        hammer_flusher_clean_loose_ios(hmp);
        hammer_io_wait_all(hmp, "hmrfl2");

        /*
         * Flush meta-data.  The meta-data will be undone if we crash
         * so we can safely flush it asynchronously.
         *
         * Repeated catchups will wind up flushing this update's meta-data
         * and the UNDO buffers for the next update simultaneously.  This
         * is ok.
         */
        count = 0;
        while ((io = TAILQ_FIRST(&hmp->meta_list)) != NULL) {
                KKASSERT(io->modify_refs == 0);
                if (io->lock.refs == 0)
                        ++hammer_count_refedbufs;
                hammer_ref(&io->lock);
                KKASSERT(io->type != HAMMER_STRUCTURE_VOLUME);
                hammer_io_flush(io);
                hammer_rel_buffer((hammer_buffer_t)io, 0);
                ++count;
        }

        /*
         * If this is the final finalization for the flush group, set up
         * for the next sequence by setting a new first_offset in our
         * cached blockmap and clearing the undo history.
         */
        if (final) {
                cundomap = &hmp->blockmap[HAMMER_ZONE_UNDO_INDEX];
                cundomap->first_offset = cundomap->next_offset;
                hammer_clear_undo_history(hmp);
        }

        hammer_sync_unlock(trans);
}