/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_ccms.c,v 1.4 2007/04/30 07:18:53 dillon Exp $
 */
/*
 * The Cache Coherency Management System (CCMS)
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/objcache.h>
#include <sys/ccms.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <machine/limits.h>
struct ccms_lock_scan_info {
        ccms_dataspace_t ds;
        ccms_lock_t lock;
        ccms_cst_t  cst1;
        ccms_cst_t  cst2;
        ccms_cst_t  coll_cst;
};
static int ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2);
static int ccms_lock_scan_cmp(ccms_cst_t b1, void *arg);
static int ccms_lock_undo_cmp(ccms_cst_t b1, void *arg);
static int ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg);
static int ccms_lock_get_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_undo_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_redo_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_put_match(struct ccms_cst *cst, void *arg);

RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp,
             off_t, beg_offset, end_offset);

static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System");

static int ccms_enable;
SYSCTL_INT(_kern, OID_AUTO, ccms_enable, CTLFLAG_RW, &ccms_enable, 0, "");

static struct objcache *ccms_oc;
/*
 * Initialize the CCMS subsystem
 */
static void
ccmsinit(void *dummy)
{
        ccms_oc = objcache_create_simple(M_CCMS, sizeof(struct ccms_cst));
}
SYSINIT(ccms, SI_BOOT2_MACHDEP, SI_ORDER_ANY, ccmsinit, NULL);
/*
 * Initialize a new CCMS dataspace.  Create a new RB tree with a single
 * element covering the entire 64 bit offset range.  This simplifies
 * algorithms enormously by removing a number of special cases.
 */
void
ccms_dataspace_init(ccms_dataspace_t ds)
{
        ccms_cst_t cst;

        RB_INIT(&ds->tree);
        ds->delayed_free = NULL;
        cst = objcache_get(ccms_oc, M_WAITOK);
        bzero(cst, sizeof(*cst));
        cst->beg_offset = LLONG_MIN;
        cst->end_offset = LLONG_MAX;
        cst->state = CCMS_STATE_INVALID;
        RB_INSERT(ccms_rb_tree, &ds->tree, cst);
        spin_init(&ds->spin);
}
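
/*
 * Example (illustrative sketch added during editing, not original code):
 * a typical consumer embeds a ccms_dataspace in its own per-object
 * structure and brackets its lifetime with ccms_dataspace_init() and
 * ccms_dataspace_destroy().  The "example_node" structure is hypothetical.
 */
#ifdef CCMS_EXAMPLE
struct example_node {
        struct ccms_dataspace ds;
};

static void
example_node_init(struct example_node *node)
{
        /* one INVALID cst now spans the entire 64 bit offset range */
        ccms_dataspace_init(&node->ds);
}

static void
example_node_uninit(struct example_node *node)
{
        /* returns all remaining csts to the objcache */
        ccms_dataspace_destroy(&node->ds);
}
#endif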
/*
 * Helper to destroy deleted cst's.
 */
static __inline
void
ccms_delayed_free(ccms_cst_t cstn)
{
        ccms_cst_t cst;

        while ((cst = cstn) != NULL) {
                cstn = cst->delayed_next;
                objcache_put(ccms_oc, cst);
        }
}
/*
 * Destroy a CCMS dataspace.
 *
 * MPSAFE
 */
void
ccms_dataspace_destroy(ccms_dataspace_t ds)
{
        ccms_cst_t cst;

        spin_lock_wr(&ds->spin);
        RB_SCAN(ccms_rb_tree, &ds->tree, NULL,
                ccms_dataspace_destroy_match, ds);
        cst = ds->delayed_free;
        ds->delayed_free = NULL;
        spin_unlock_wr(&ds->spin);
        ccms_delayed_free(cst);
}
/*
 * Helper routine to delete matches during a destroy.
 *
 * NOTE: called with spinlock held.
 */
static
int
ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg)
{
        ccms_dataspace_t ds = arg;

        RB_REMOVE(ccms_rb_tree, &ds->tree, cst);
        cst->delayed_next = ds->delayed_free;
        ds->delayed_free = cst;
        return(0);
}
/*
 * Obtain a CCMS lock
 *
 * MPSAFE
 */
int
ccms_lock_get(ccms_dataspace_t ds, ccms_lock_t lock)
{
        struct ccms_lock_scan_info info;
        ccms_cst_t cst;

        if (ccms_enable == 0) {
                lock->ds = NULL;
                return(0);
        }

        /*
         * Partition the CST space so the precise range is covered and
         * attempt to obtain the requested local lock (ltype) at the same
         * time.
         */
        lock->ds = ds;
        info.lock = lock;
        info.ds = ds;
        info.coll_cst = NULL;
        info.cst1 = objcache_get(ccms_oc, M_WAITOK);
        info.cst2 = objcache_get(ccms_oc, M_WAITOK);

        spin_lock_wr(&ds->spin);
        RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
                ccms_lock_get_match, &info);

        /*
         * If a collision occurred, undo the fragments we were able to obtain,
         * block, and try again.
         */
        while (info.coll_cst != NULL) {
                RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_undo_cmp,
                        ccms_lock_undo_match, &info);
                info.coll_cst->blocked = 1;
                msleep(info.coll_cst, &ds->spin, 0,
                       ((lock->ltype == CCMS_LTYPE_SHARED) ? "rngsh" : "rngex"),
                       hz);
                info.coll_cst = NULL;
                RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
                        ccms_lock_redo_match, &info);
        }
        cst = ds->delayed_free;
        ds->delayed_free = NULL;
        spin_unlock_wr(&ds->spin);

        /*
         * Cleanup
         */
        ccms_delayed_free(cst);
        if (info.cst1)
                objcache_put(ccms_oc, info.cst1);
        if (info.cst2)
                objcache_put(ccms_oc, info.cst2);

        return(0);
}
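
/*
 * Illustrative scenario (added comment, not original code): suppose
 * thread B holds [100,199] exclusively and thread A requests a shared
 * lock on [50,149].  A's first scan acquires [50,99], collides on B's
 * cst, undoes its partial acquisition via ccms_lock_undo_match(), sleeps
 * on the colliding cst, and rescans after B's ccms_lock_put() wakes it.
 */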
/*
 * Obtain a CCMS lock, initialize the lock structure from the uio.
 *
 * MPSAFE
 */
int
ccms_lock_get_uio(ccms_dataspace_t ds, ccms_lock_t lock, struct uio *uio)
{
        ccms_ltype_t ltype;
        off_t eoff;

        if (uio->uio_rw == UIO_READ)
                ltype = CCMS_LTYPE_SHARED;
        else
                ltype = CCMS_LTYPE_MODIFYING;

        /*
         * Calculate the ending offset (byte inclusive), make sure a seek
         * overflow does not blow us up.
         */
        eoff = uio->uio_offset + uio->uio_resid - 1;
        if (eoff < uio->uio_offset)
                eoff = 0x7FFFFFFFFFFFFFFFLL;
        ccms_lock_init(lock, uio->uio_offset, eoff, ltype);
        return(ccms_lock_get(ds, lock));
}
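
/*
 * Example (illustrative sketch, not original code): an I/O path wrapping
 * its data movement in a uio-derived CCMS lock.  "example_node" is the
 * hypothetical structure from the earlier example; the copy step is
 * elided.
 */
#ifdef CCMS_EXAMPLE
static int
example_io(struct example_node *node, struct uio *uio)
{
        struct ccms_lock lock;
        int error;

        /* UIO_READ maps to a shared lock, writes to a modifying lock */
        error = ccms_lock_get_uio(&node->ds, &lock, uio);
        if (error == 0) {
                /* ... uiomove() the data here ... */
                error = ccms_lock_put(&node->ds, &lock);
        }
        return(error);
}
#endif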
/*
 * Helper routine.
 *
 * NOTE: called with spinlock held.
 */
static
int
ccms_lock_get_match(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;
        ccms_cst_t ncst;

        /*
         * If the lock's left edge is within the CST we must split the CST
         * into two pieces [cst][ncst].  lrefs must be bumped on the CST
         * containing the left edge.
         *
         * NOTE! cst->beg_offset may not be modified.  This allows us to avoid
         * having to manipulate the cst's position in the tree.
         */
        if (lock->beg_offset > cst->beg_offset) {
                ncst = info->cst1;
                info->cst1 = NULL;
                KKASSERT(ncst != NULL);
                *ncst = *cst;
                cst->end_offset = lock->beg_offset - 1;
                cst->rrefs = 0;
                ncst->beg_offset = lock->beg_offset;
                ncst->lrefs = 1;
                RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);

                /*
                 * ncst becomes our 'matching' cst.
                 */
                cst = ncst;
        } else if (lock->beg_offset == cst->beg_offset) {
                ++cst->lrefs;
        }

        /*
         * If the lock's right edge is within the CST we must split the CST
         * into two pieces [cst][ncst].  rrefs must be bumped on the CST
         * containing the right edge.
         *
         * NOTE! cst->beg_offset may not be modified.  This allows us to avoid
         * having to manipulate the cst's position in the tree.
         */
        if (lock->end_offset < cst->end_offset) {
                ncst = info->cst2;
                info->cst2 = NULL;
                KKASSERT(ncst != NULL);
                *ncst = *cst;
                cst->end_offset = lock->end_offset;
                cst->rrefs = 1;
                ncst->beg_offset = lock->end_offset + 1;
                ncst->lrefs = 0;
                RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);
                /* cst remains our 'matching' cst */
        } else if (lock->end_offset == cst->end_offset) {
                ++cst->rrefs;
        }

        /*
         * The lock covers the CST, so increment the CST's coverage count.
         * Then attempt to obtain the shared/exclusive ltype.
         */
        ++cst->xrefs;

        if (info->coll_cst == NULL) {
                switch(lock->ltype) {
                case CCMS_LTYPE_SHARED:
                        if (cst->sharecount < 0) {
                                info->coll_cst = cst;
                        } else {
                                ++cst->sharecount;
                                if (ccms_enable >= 9) {
                                        kprintf("CST SHARE %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                        }
                        break;
                case CCMS_LTYPE_EXCLUSIVE:
                        if (cst->sharecount != 0) {
                                info->coll_cst = cst;
                        } else {
                                --cst->sharecount;
                                if (ccms_enable >= 9) {
                                        kprintf("CST EXCLS %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                        }
                        break;
                case CCMS_LTYPE_MODIFYING:
                        if (cst->sharecount != 0) {
                                info->coll_cst = cst;
                        } else {
                                --cst->sharecount;
                                ++cst->modifycount;
                                if (ccms_enable >= 9) {
                                        kprintf("CST MODXL %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                        }
                        break;
                }
        }
        return(0);
}
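
/*
 * Worked example (added for clarity): locking [100,199] against a single
 * cst spanning [0,499] splits it into [0,99][100,199][200,499].  The
 * middle fragment gets lrefs=1, rrefs=1, and its xrefs bumped; only that
 * fragment has its sharecount adjusted for the requested ltype.
 */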
/*
 * Undo a partially resolved ccms_ltype rangelock.  This is atomic with
 * the scan/redo code so there should not be any blocked locks when
 * transitioning to 0.
 *
 * NOTE: called with spinlock held.
 */
static
int
ccms_lock_undo_match(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;

        switch(lock->ltype) {
        case CCMS_LTYPE_SHARED:
                KKASSERT(cst->sharecount > 0);
                --cst->sharecount;
                KKASSERT(cst->sharecount || cst->blocked == 0);
                break;
        case CCMS_LTYPE_EXCLUSIVE:
                KKASSERT(cst->sharecount < 0);
                ++cst->sharecount;
                KKASSERT(cst->sharecount || cst->blocked == 0);
                break;
        case CCMS_LTYPE_MODIFYING:
                KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
                ++cst->sharecount;
                --cst->modifycount;
                KKASSERT(cst->sharecount || cst->blocked == 0);
                break;
        }
        return(0);
}
/*
 * Redo the local lock request for a range which has already been
 * partitioned.
 *
 * NOTE: called with spinlock held.
 */
static
int
ccms_lock_redo_match(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;

        if (info->coll_cst == NULL) {
                switch(lock->ltype) {
                case CCMS_LTYPE_SHARED:
                        if (cst->sharecount < 0) {
                                info->coll_cst = cst;
                        } else {
                                if (ccms_enable >= 9) {
                                        kprintf("CST SHARE %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                                ++cst->sharecount;
                        }
                        break;
                case CCMS_LTYPE_EXCLUSIVE:
                        if (cst->sharecount != 0) {
                                info->coll_cst = cst;
                        } else {
                                --cst->sharecount;
                                if (ccms_enable >= 9) {
                                        kprintf("CST EXCLS %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                        }
                        break;
                case CCMS_LTYPE_MODIFYING:
                        if (cst->sharecount != 0) {
                                info->coll_cst = cst;
                        } else {
                                --cst->sharecount;
                                ++cst->modifycount;
                                if (ccms_enable >= 9) {
                                        kprintf("CST MODXL %d %lld-%lld\n",
                                                cst->sharecount,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                        }
                        break;
                }
        }
        return(0);
}
/*
 * Release a CCMS lock
 *
 * MPSAFE
 */
int
ccms_lock_put(ccms_dataspace_t ds, ccms_lock_t lock)
{
        struct ccms_lock_scan_info info;
        ccms_cst_t cst;

        if (lock->ds == NULL)
                return(0);
        lock->ds = NULL;
        info.lock = lock;
        info.ds = ds;
        info.cst1 = NULL;
        info.cst2 = NULL;

        spin_lock_wr(&ds->spin);
        RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
                ccms_lock_put_match, &info);
        cst = ds->delayed_free;
        ds->delayed_free = NULL;
        spin_unlock_wr(&ds->spin);

        ccms_delayed_free(cst);
        if (info.cst1)
                objcache_put(ccms_oc, info.cst1);
        if (info.cst2)
                objcache_put(ccms_oc, info.cst2);
        return(0);
}
/*
 * Helper routine.
 *
 * NOTE: called with spinlock held.
 */
static
int
ccms_lock_put_match(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;
        ccms_cst_t ocst;

        /*
         * Undo the local shared/exclusive rangelock.
         */
        switch(lock->ltype) {
        case CCMS_LTYPE_SHARED:
                KKASSERT(cst->sharecount > 0);
                --cst->sharecount;
                if (ccms_enable >= 9) {
                        kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->sharecount,
                                (long long)cst->beg_offset,
                                (long long)cst->end_offset,
                                cst->blocked);
                }
                if (cst->blocked && cst->sharecount == 0) {
                        cst->blocked = 0;
                        wakeup(cst);
                }
                break;
        case CCMS_LTYPE_EXCLUSIVE:
                KKASSERT(cst->sharecount < 0);
                ++cst->sharecount;
                if (ccms_enable >= 9) {
                        kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->sharecount,
                                (long long)cst->beg_offset,
                                (long long)cst->end_offset,
                                cst->blocked);
                }
                if (cst->blocked && cst->sharecount == 0) {
                        cst->blocked = 0;
                        wakeup(cst);
                }
                break;
        case CCMS_LTYPE_MODIFYING:
                KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
                ++cst->sharecount;
                --cst->modifycount;
                if (ccms_enable >= 9) {
                        kprintf("CST UNMOD %d %lld-%lld (%d)\n", cst->sharecount,
                                (long long)cst->beg_offset,
                                (long long)cst->end_offset,
                                cst->blocked);
                }
                if (cst->blocked && cst->sharecount == 0) {
                        cst->blocked = 0;
                        wakeup(cst);
                }
                break;
        }
        /*
         * Decrement the lock coverage count on the CST.  Decrement the left
         * and right edge counts as appropriate.
         *
         * When lrefs or rrefs drops to zero we check the adjacent entry to
         * determine whether a merge is possible.  If the appropriate refs
         * field (rrefs for the entry to our left, lrefs for the entry to our
         * right) is 0, then all covering locks must cover both entries and
         * the xrefs field must match.  We can then merge the entries if they
         * have compatible cache states.
         *
         * However, because we are cleaning up the shared/exclusive count at
         * the same time, the sharecount field may be temporarily out of
         * sync, so require that the sharecount field also match before doing
         * a merge.
         *
         * When merging an element which is being blocked on, the blocking
         * thread(s) will be woken up.
         *
         * If the dataspace has too many CSTs we may be able to merge the
         * entries even if their cache states are not the same, by dropping
         * both to a compatible (lower) cache state and performing the
         * appropriate management operations.  XXX
         */
        --cst->xrefs;
        if (lock->beg_offset == cst->beg_offset) {
                --cst->lrefs;
                if (cst->lrefs == 0) {
                        if ((ocst = RB_PREV(ccms_rb_tree,
                                            &info->ds->tree, cst)) != NULL &&
                            ocst->rrefs == 0 &&
                            ocst->state == cst->state &&
                            ocst->sharecount == cst->sharecount
                        ) {
                                KKASSERT(ocst->xrefs == cst->xrefs);
                                KKASSERT(ocst->end_offset + 1 == cst->beg_offset);
                                RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
                                cst->beg_offset = ocst->beg_offset;
                                cst->lrefs = ocst->lrefs;
                                if (ccms_enable >= 9) {
                                        kprintf("MERGELEFT %p %lld-%lld (%d)\n",
                                                ocst,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset,
                                                cst->blocked);
                                }
                                if (ocst->blocked) {
                                        ocst->blocked = 0;
                                        wakeup(ocst);
                                }
                                /*objcache_put(ccms_oc, ocst);*/
                                ocst->delayed_next = info->ds->delayed_free;
                                info->ds->delayed_free = ocst;
                        }
                }
        }
        if (lock->end_offset == cst->end_offset) {
                --cst->rrefs;
                if (cst->rrefs == 0) {
                        if ((ocst = RB_NEXT(ccms_rb_tree,
                                            &info->ds->tree, cst)) != NULL &&
                            ocst->lrefs == 0 &&
                            ocst->state == cst->state &&
                            ocst->sharecount == cst->sharecount
                        ) {
                                KKASSERT(ocst->xrefs == cst->xrefs);
                                KKASSERT(cst->end_offset + 1 == ocst->beg_offset);
                                RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
                                cst->end_offset = ocst->end_offset;
                                cst->rrefs = ocst->rrefs;
                                if (ccms_enable >= 9) {
                                        kprintf("MERGERIGHT %p %lld-%lld\n",
                                                ocst,
                                                (long long)cst->beg_offset,
                                                (long long)cst->end_offset);
                                }
                                /*objcache_put(ccms_oc, ocst);*/
                                ocst->delayed_next = info->ds->delayed_free;
                                info->ds->delayed_free = ocst;
                        }
                }
        }
        return(0);
}
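
/*
 * Worked example (added for clarity): after the [0,99][100,199][200,499]
 * split above, releasing the [100,199] lock drops that fragment's lrefs
 * and rrefs to 0.  If the neighbors have matching state and sharecount,
 * the three fragments collapse back into a single [0,499] cst.
 */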
/*
 * RB tree compare function for insertions and deletions.  This function
 * compares two CSTs.
 */
static int
ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2)
{
        if (b1->end_offset < b2->beg_offset)
                return(-1);
        if (b1->beg_offset > b2->end_offset)
                return(1);
        return(0);
}
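
/*
 * Note (added): CSTs in the tree never overlap, so treating overlapping
 * ranges as "equal" still yields a total order for insert/delete while
 * letting ranged lookups land on the cst intersecting the probed range.
 */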
/*
 * RB tree scanning compare function.  This function compares the CST
 * from the tree against the supplied ccms_lock and returns the CST's
 * placement relative to the lock.
 */
static int
ccms_lock_scan_cmp(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;

        if (cst->end_offset < lock->beg_offset)
                return(-1);
        if (cst->beg_offset > lock->end_offset)
                return(1);
        return(0);
}
/*
 * This function works like ccms_lock_scan_cmp but terminates at the
 * collision point rather than at the lock's ending offset.  Only
 * the CSTs that were already partially resolved are returned by the scan.
 */
static int
ccms_lock_undo_cmp(ccms_cst_t cst, void *arg)
{
        struct ccms_lock_scan_info *info = arg;
        ccms_lock_t lock = info->lock;

        if (cst->end_offset < lock->beg_offset)
                return(-1);
        if (cst->beg_offset >= info->coll_cst->beg_offset)
                return(1);
        return(0);
}