1 /* $NetBSD: dev-io.c,v 1.6 2009/12/02 01:53:25 haad Exp $ */
4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
7 * This file is part of LVM2.
9 * This copyrighted material is made available to anyone wishing to use,
10 * modify, copy, or redistribute it subject to the terms and conditions
11 * of the GNU Lesser General Public License v.2.1.
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this program; if not, write to the Free Software Foundation,
15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 #include "lvm-types.h"
30 #include <sys/ioctl.h>
33 # define u64 uint64_t /* Missing without __KERNEL__ */
34 # undef WNOHANG /* Avoid redefinition */
35 # undef WUNTRACED /* Avoid redefinition */
36 # include <linux/fs.h> /* For block ioctl definitions */
37 # define BLKSIZE_SHIFT SECTOR_SHIFT
38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */
39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t)
40 # endif /* BLKGETSIZE64 */
42 # include <sys/disk.h>
43 # include <sys/disklabel.h>
44 # include <sys/param.h>
46 # include <sys/diskslice.h>
47 # include <sys/param.h>
49 # include <sys/disk.h>
50 # define BLKBSZGET DKIOCGETBLOCKSIZE
51 # define BLKSSZGET DKIOCGETBLOCKSIZE
52 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
53 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE
54 # define BLKSIZE_SHIFT 0
57 #ifdef O_DIRECT_SUPPORT
59 # error O_DIRECT support configured but O_DIRECT definition not found in headers
63 static DM_LIST_INIT(_open_devices);
65 /*-----------------------------------------------------------------
66 * The standard io loop that keeps submitting an io until it's
68 *---------------------------------------------------------------*/
/*
 * _io - transfer where->size bytes between BUFFER and the device area WHERE.
 *
 * where:        device, byte offset (start) and byte count (size)
 * buffer:       destination for a read, source for a write
 * should_write: non-zero selects write(), zero selects read()
 *
 * Visible behaviour: the descriptor comes from dev_fd(); writes are
 * special-cased when test_mode() is set; sizes above SSIZE_MAX are
 * rejected; the descriptor is positioned with lseek() at where->start and
 * the transfer loop runs until TOTAL reaches where->size. The return value
 * is non-zero only when the full size was transferred.
 *
 * NOTE(review): this listing has lost lines (braces, early returns, the
 * declarations of n and total, and whatever advances total and buffer
 * inside the loop). Confirm those against the complete file.
 * NOTE(review): the unopened-device and size-too-large messages say read
 * even when should_write is set.
 */
69 static int _io(struct device_area *where, void *buffer, int should_write)
71 int fd = dev_fd(where->dev);
76 log_error("Attempt to read an unopened device (%s).",
77 dev_name(where->dev));
82 * Skip all writes in test mode.
84 if (should_write && test_mode())
/* read() and write() take a size_t but report ssize_t, hence the cap. */
87 if (where->size > SSIZE_MAX) {
88 log_error("Read size too large: %" PRIu64, where->size);
92 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
93 log_error("%s: lseek %" PRIu64 " failed: %s",
94 dev_name(where->dev), (uint64_t) where->start,
/* Keep issuing the syscall until the whole area has been transferred. */
99 while (total < (size_t) where->size) {
102 write(fd, buffer, (size_t) where->size - total) :
103 read(fd, buffer, (size_t) where->size - total);
/* Retry the same call when it was interrupted or would have blocked. */
104 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
107 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
108 " at %" PRIu64 ": %s", dev_name(where->dev),
109 should_write ? "write" : "read",
111 (uint64_t) where->size,
112 (uint64_t) where->start, strerror(errno));
121 return (total == (size_t) where->size);
124 /*-----------------------------------------------------------------
125 * LVM2 uses O_DIRECT when performing metadata io, which requires
126 * block size aligned accesses. If any io is not aligned we have
127 * to perform the io via a bounce buffer, obviously this is quite
129 *---------------------------------------------------------------*/
132 * Get the sector size from an _open_ device.
/*
 * _get_block_size - report the block size of an open device through SIZE.
 *
 * dev:  device whose descriptor is obtained with dev_fd()
 * size: receives dev->block_size converted to unsigned int
 *
 * dev->block_size acts as a cache: the ioctl is only issued while it is
 * still -1 (the value _close() stores when the device is closed).
 *
 * Three query variants are visible: DIOCGDINFO (struct disklabel,
 * d_secsize), DIOCGPART (struct partinfo, media_blksize) and BLKBSZGET.
 * The first two fall back to DEV_BSIZE when the ioctl fails; the third
 * logs a system error.
 *
 * NOTE(review): the preprocessor conditionals that pick one variant per
 * platform, and the return statements, are missing from this listing.
 */
134 static int _get_block_size(struct device *dev, unsigned int *size)
136 const char *name = dev_name(dev);
138 struct disklabel lab;
140 struct partinfo pinfo;
/* Only query the kernel when no cached value is present. */
143 if ((dev->block_size == -1)) {
145 if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
146 dev->block_size = DEV_BSIZE;
148 dev->block_size = lab.d_secsize;
150 if (ioctl(dev_fd(dev), DIOCGPART, &pinfo) < 0) {
151 dev->block_size = DEV_BSIZE;
153 dev->block_size = pinfo.media_blksize;
155 if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
156 log_sys_error("ioctl BLKBSZGET", name);
160 log_debug("%s: block size is %u bytes", name, dev->block_size);
163 *size = (unsigned int) dev->block_size;
169 * Widens a region to be an aligned region.
/*
 * _widen_region - expand REGION so that it starts and ends on a multiple
 * of BLOCK_SIZE, storing the outcome in RESULT.
 *
 * block_size: alignment unit; the mask arithmetic below is only a modulo
 *             when this is a power of two
 * region:     area requested by the caller (left untouched)
 * result:     receives a copy of REGION with start moved down and size
 *             grown as needed
 *
 * NOTE(review): the guards around both adjustments are missing from this
 * listing; presumably each is applied only when delta is non-zero -
 * confirm against the complete file.
 */
171 static void _widen_region(unsigned int block_size, struct device_area *region,
172 struct device_area *result)
174 uint64_t mask = block_size - 1, delta;
175 memcpy(result, region, sizeof(*result));
177 /* adjust the start */
178 delta = result->start & mask;
180 result->start -= delta;
181 result->size += delta;
/* delta is now how far the end of the region sits past a block boundary. */
185 delta = (result->start + result->size) & mask;
187 result->size += block_size - delta;
/*
 * _aligned_io - perform WHERE through _io(), going via an aligned bounce
 * buffer when the request or the caller buffer is not block aligned.
 *
 * where:  requested device area
 * buffer: caller data (source for writes, destination for reads)
 * The remainder of the parameter list is missing from this listing; the
 * body uses should_write.
 *
 * Visible steps:
 *   - for devices without DEV_REGULAR the block size comes from
 *     _get_block_size(); lvm_getpagesize() is also used as a block size
 *     (the condition selecting it is not visible);
 *   - the request is widened with _widen_region();
 *   - when widening changed nothing and BUFFER is aligned to the block
 *     size, the request goes straight to _io();
 *   - otherwise a bounce buffer of widened.size plus one block is taken
 *     with alloca(), realigned, filled by reading the widened area, and
 *     then either patched with caller data and written back, or copied
 *     out to BUFFER.
 *
 * NOTE(review): alloca() does not report failure by returning NULL, so
 * the check below cannot fire, and widened.size is not bounded before it
 * is used as a stack allocation size.
 * NOTE(review): the declarations of bounce and mask, the branch structure
 * around the read-failure/memset path and the final return are missing
 * from this listing.
 */
190 static int _aligned_io(struct device_area *where, void *buffer,
194 unsigned int block_size = 0;
196 struct device_area widened;
198 if (!(where->dev->flags & DEV_REGULAR) &&
199 !_get_block_size(where->dev, &block_size))
203 block_size = lvm_getpagesize();
205 _widen_region(block_size, where, &widened);
207 /* Do we need to use a bounce buffer? */
208 mask = block_size - 1;
209 if (!memcmp(where, &widened, sizeof(widened)) &&
210 !((uintptr_t) buffer & mask))
211 return _io(where, buffer, should_write);
213 /* Allocate a bounce buffer with an extra block */
214 if (!(bounce = alloca((size_t) widened.size + block_size))) {
215 log_error("Bounce buffer alloca failed");
220 * Realign start of bounce buffer (using the extra sector)
222 if (((uintptr_t) bounce) & mask)
223 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);
225 /* channel the io through the bounce buffer */
226 if (!_io(&widened, bounce, 0)) {
229 /* FIXME pre-extend the file */
230 memset(bounce, '\n', widened.size);
/* Overlay the caller data at its offset inside the widened area. */
234 memcpy(bounce + (where->start - widened.start), buffer,
235 (size_t) where->size);
237 /* ... then we write */
238 return _io(&widened, bounce, 1);
/* Read path: hand back only the bytes the caller asked for. */
241 memcpy(buffer, bounce + (where->start - widened.start),
242 (size_t) where->size);
/*
 * _dev_get_size_file - size of a file-backed device, in sectors.
 *
 * dev:  device; only its name (dev_name()) is used here
 * size: receives st_size from stat() shifted right by SECTOR_SHIFT
 *
 * A stat() failure is logged with log_sys_error(). The declaration of
 * info and the return statements are missing from this listing.
 */
247 static int _dev_get_size_file(const struct device *dev, uint64_t *size)
249 const char *name = dev_name(dev);
252 if (stat(name, &info)) {
253 log_sys_error("stat", name);
257 *size = info.st_size;
258 *size >>= SECTOR_SHIFT; /* Convert to sectors */
260 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
/*
 * _dev_get_size_dev - size of a device node, in sectors.
 *
 * dev:  device; it is opened by name with O_RDONLY for the query
 * size: receives the size
 *
 * Three variants are visible:
 *   - lseek(SEEK_END) plus DIOCGDINFO, falling back to DIOCGWEDGEINFO
 *     (dkw_size), with a division by lab.d_secsize;
 *   - lseek(SEEK_END) plus DIOCGPART, using media_blocks or dividing by
 *     media_blksize;
 *   - BLKGETSIZE64 followed by a right shift of BLKSIZE_SHIFT.
 *
 * NOTE(review): *size is a uint64_t, so both tests of the form
 * (*size = lseek(...)) < 0 compare an unsigned value with zero and can
 * never be true; an lseek() failure would go unnoticed.
 * NOTE(review): the two log_debug() calls for DIOCGWEDGEINFO and DIOCGPART
 * pass name although the format has no conversion for it.
 * NOTE(review): the preprocessor conditionals selecting a variant, the
 * branch structure, the close() calls and the returns are missing from
 * this listing.
 */
265 static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
268 const char *name = dev_name(dev);
270 struct disklabel lab;
271 struct dkwedge_info dkw;
273 struct partinfo pinfo;
276 if ((fd = open(name, O_RDONLY)) < 0) {
278 log_sys_error("open", name);
284 if ((*size = lseek (fd, 0, SEEK_END)) < 0) {
285 log_sys_error("lseek SEEK_END", name);
290 if (ioctl(fd, DIOCGDINFO, &lab) < 0) {
291 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) {
292 log_debug("ioctl DIOCGWEDGEINFO", name);
297 *size = dkw.dkw_size;
300 *size /= lab.d_secsize;
302 if ((*size = lseek (fd, 0, SEEK_END)) < 0) {
303 log_sys_error("lseek SEEK_END", name);
308 if (ioctl(fd, DIOCGPART, &pinfo) < 0) {
309 log_debug("ioctl DIOCGPART", name);
314 /* XXX: we could also get the size this way, instead of lseek */
315 if (pinfo.media_blocks)
316 *size = pinfo.media_blocks;
318 if (pinfo.media_blksize)
319 *size /= pinfo.media_blksize;
322 if (ioctl(fd, BLKGETSIZE64, size) < 0) {
323 log_sys_error("ioctl BLKGETSIZE64", name);
325 log_sys_error("close", name);
329 *size >>= BLKSIZE_SHIFT; /* Convert to sectors */
332 log_sys_error("close", name);
334 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
/*
 * _dev_read_ahead_dev - report the read-ahead setting of a device.
 *
 * dev:        device; dev->read_ahead caches the value, -1 meaning unset
 * read_ahead: receives the value as uint32_t
 *
 * When no cached value exists the setting is fetched with the BLKRAGET
 * ioctl on dev->fd into a long, narrowed to uint32_t for the caller and
 * stored back into dev->read_ahead.
 *
 * NOTE(review): how dev->fd gets opened before the ioctl, and the return
 * statements, are not visible in this listing.
 */
339 static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
342 long read_ahead_long;
/* Serve the cached value when one has already been recorded. */
344 if (dev->read_ahead != -1) {
345 *read_ahead = (uint32_t) dev->read_ahead;
352 if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
353 log_sys_error("ioctl BLKRAGET", dev_name(dev));
362 *read_ahead = (uint32_t) read_ahead_long;
363 dev->read_ahead = read_ahead_long;
365 log_very_verbose("%s: read_ahead is %u sectors",
366 dev_name(dev), *read_ahead);
371 /*-----------------------------------------------------------------
373 *---------------------------------------------------------------*/
/*
 * dev_get_size - report the size of DEV through SIZE.
 *
 * Devices flagged DEV_REGULAR are handled by _dev_get_size_file(), all
 * others by _dev_get_size_dev(); the return value is whatever the chosen
 * helper returns. Lines between the signature and the first test are
 * missing from this listing.
 */
375 int dev_get_size(const struct device *dev, uint64_t *size)
380 if ((dev->flags & DEV_REGULAR))
381 return _dev_get_size_file(dev, size);
383 return _dev_get_size_dev(dev, size);
/*
 * dev_get_read_ahead - report the read-ahead setting of DEV.
 *
 * Devices without DEV_REGULAR are delegated to _dev_read_ahead_dev().
 * NOTE(review): the body of the DEV_REGULAR branch is missing from this
 * listing, so what is reported for regular files cannot be stated here.
 */
386 int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
391 if (dev->flags & DEV_REGULAR) {
396 return _dev_read_ahead_dev(dev, read_ahead);
/*
 * dev_get_sectsize - report the sector size of DEV in bytes.
 *
 * dev:  device; it is opened by name with O_RDONLY for the query
 * size: receives the value returned by the BLKSSZGET ioctl, as uint32_t
 *
 * Failures of open(), ioctl() and close() are logged with
 * log_sys_error(). The declarations of fd and s, the close() calls and
 * the return statements are missing from this listing.
 */
400 int dev_get_sectsize(struct device *dev, uint32_t *size)
404 const char *name = dev_name(dev);
406 if ((fd = open(name, O_RDONLY)) < 0) {
407 log_sys_error("open", name);
411 if (ioctl(fd, BLKSSZGET, &s) < 0) {
412 log_sys_error("ioctl BLKSSZGET", name);
414 log_sys_error("close", name);
419 log_sys_error("close", name);
421 *size = (uint32_t) s;
423 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);
/*
 * dev_flush - push pending data for DEV out through dev->fd.
 *
 * Two mechanisms are visible: the BLKFLSBUF ioctl, attempted only for
 * devices without DEV_REGULAR, and fsync().
 * NOTE(review): the statements executed after each successful call, and
 * anything that follows the fsync() test, are missing from this listing.
 */
429 void dev_flush(struct device *dev)
432 if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
436 if (fsync(dev->fd) >= 0)
/*
 * dev_open_flags - open DEV with the given open() FLAGS.
 *
 * dev:    device to open; dev->fd, dev->flags and dev->end are updated
 * flags:  open() flags; an O_RDWR access mode and O_EXCL are tracked in
 *         need_rw and need_excl
 * direct: consulted together with DEV_O_DIRECT_TESTED when the first
 *         open() fails under O_DIRECT_SUPPORT
 * quiet:  forwarded to dev_name_confirmed()
 *
 * Visible behaviour:
 *   - an existing open that already satisfies the requested read-write
 *     and exclusive needs is detected up front;
 *   - otherwise an existing open is dropped with dev_close_immediate();
 *   - for devices without DEV_REGULAR the name is confirmed, stat()ed and
 *     its st_rdev compared with dev->dev before opening, and compared
 *     again with fstat() on the new descriptor;
 *   - under O_DIRECT_SUPPORT a failed open() is retried once, and on
 *     success DEV_O_DIRECT is cleared;
 *   - DEV_OPENED_RW, DEV_OPENED_EXCL, DEV_ACCESSED_W and
 *     DEV_O_DIRECT_TESTED are updated, dev->end is set from
 *     lseek(SEEK_END) when O_CREAT is given without O_TRUNC, and the
 *     device is linked into _open_devices.
 *
 * NOTE(review): log_sys_error() is called everywhere else in this file
 * with a bare call name such as stat or open as its first argument; the
 * call after the failed stat() below passes a format-like string instead
 * and probably yields a malformed message - confirm against the macro.
 * NOTE(review): many lines are missing from this listing (flag
 * assignments, returns, the suspended-state test, the matching endif and
 * else lines), so the exact branch structure must be confirmed against
 * the complete file.
 */
442 int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
446 int need_excl = 0, need_rw = 0;
448 if ((flags & O_ACCMODE) == O_RDWR)
451 if ((flags & O_EXCL))
/* Is the device already open in a mode that covers this request? */
455 if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
456 ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
461 if (dev->open_count && !need_excl) {
462 /* FIXME Ensure we never get here */
463 log_debug("WARNING: %s already opened read-only",
468 dev_close_immediate(dev);
472 log_error("WARNING: dev_open(%s) called while suspended",
475 if (dev->flags & DEV_REGULAR)
476 name = dev_name(dev);
477 else if (!(name = dev_name_confirmed(dev, quiet)))
/* Check that the path still refers to the expected device number. */
480 if (!(dev->flags & DEV_REGULAR)) {
481 if (stat(name, &buf) < 0) {
482 log_sys_error("%s: stat failed", name);
485 if (buf.st_rdev != dev->dev) {
486 log_error("%s: device changed", name);
491 #ifdef O_DIRECT_SUPPORT
493 if (!(dev->flags & DEV_O_DIRECT_TESTED))
494 dev->flags |= DEV_O_DIRECT;
496 if ((dev->flags & DEV_O_DIRECT))
502 /* Don't update atime on device inodes */
503 if (!(dev->flags & DEV_REGULAR))
/* The mode argument of open() only matters when O_CREAT is set. */
507 if ((dev->fd = open(name, flags, 0777)) < 0) {
508 #ifdef O_DIRECT_SUPPORT
509 if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
511 if ((dev->fd = open(name, flags, 0777)) >= 0) {
512 dev->flags &= ~DEV_O_DIRECT;
513 log_debug("%s: Not using O_DIRECT", name);
519 log_sys_debug("open", name);
521 log_sys_error("open", name);
526 #ifdef O_DIRECT_SUPPORT
529 dev->flags |= DEV_O_DIRECT_TESTED;
532 dev->flags &= ~DEV_ACCESSED_W;
535 dev->flags |= DEV_OPENED_RW;
537 dev->flags &= ~DEV_OPENED_RW;
540 dev->flags |= DEV_OPENED_EXCL;
542 dev->flags &= ~DEV_OPENED_EXCL;
/* Re-check the device number on the descriptor that was just opened. */
544 if (!(dev->flags & DEV_REGULAR) &&
545 ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
546 log_error("%s: fstat failed: Has device name changed?", name);
547 dev_close_immediate(dev);
551 #ifndef O_DIRECT_SUPPORT
552 if (!(dev->flags & DEV_REGULAR))
/* Remember the current end of file for later appends. */
556 if ((flags & O_CREAT) && !(flags & O_TRUNC))
557 dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);
559 dm_list_add(&_open_devices, &dev->open_list);
561 log_debug("Opened %s %s%s%s", dev_name(dev),
562 dev->flags & DEV_OPENED_RW ? "RW" : "RO",
563 dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
564 dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");
/*
 * dev_open_quiet - open DEV through dev_open_flags() with direct set to 1
 * and quiet set to 1.
 *
 * The access mode is O_RDWR when vg_write_lock_held() reports true and
 * O_RDONLY otherwise. Returns the result of dev_open_flags().
 */
569 int dev_open_quiet(struct device *dev)
573 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
575 return dev_open_flags(dev, flags, 1, 1);
/*
 * dev_open - open DEV through dev_open_flags() with direct set to 1 and
 * quiet set to 0.
 *
 * The access mode is O_RDWR when vg_write_lock_held() reports true and
 * O_RDONLY otherwise. Returns the result of dev_open_flags().
 */
578 int dev_open(struct device *dev)
582 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
584 return dev_open_flags(dev, flags, 1, 0);
/*
 * dev_test_excl - open DEV quietly via dev_open_flags() and close it
 * again with dev_close_immediate().
 *
 * The access mode follows vg_write_lock_held() as in dev_open().
 * NOTE(review): the line that would add an exclusive-open flag, the
 * condition guarding the close and the return of r are missing from this
 * listing - confirm against the complete file.
 */
587 int dev_test_excl(struct device *dev)
592 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
595 r = dev_open_flags(dev, flags, 1, 1);
597 dev_close_immediate(dev);
/*
 * _close - tear down the open state of DEV.
 *
 * Visible effects: a close error is logged, dev->block_size is reset to
 * -1 so that _get_block_size() queries the kernel again next time, the
 * device is unlinked from the open-device list, and for devices flagged
 * DEV_ALLOCED memory reached through dev->aliases.n is released with
 * dm_free().
 *
 * NOTE(review): the close() call itself, the end of the first dm_free()
 * expression and any further frees are missing from this listing.
 */
602 static void _close(struct device *dev)
605 log_sys_error("close", dev_name(dev));
607 dev->block_size = -1;
608 dm_list_del(&dev->open_list);
610 log_debug("Closed %s", dev_name(dev));
612 if (dev->flags & DEV_ALLOCED) {
613 dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
615 dm_free(dev->aliases.n);
/*
 * _dev_close - drop a reference to DEV and possibly close it.
 *
 * dev:       device to release
 * immediate: non-zero when the caller wants the descriptor closed now; a
 *            debug message is logged if the device is still referenced
 *
 * Visible behaviour: an attempt to close a device that is not open is
 * logged as an error; without O_DIRECT_SUPPORT something is done for
 * devices with DEV_ACCESSED_W set; the final condition closes the device
 * when open_count is below 1 and it is not known to belong to a locked
 * VG (looked up with info_from_pvid() and vgname_is_locked()).
 *
 * NOTE(review): the tests, the open_count update, the first half of the
 * final condition, the call that performs the close and the returns are
 * missing from this listing.
 */
620 static int _dev_close(struct device *dev, int immediate)
622 struct lvmcache_info *info;
625 log_error("Attempt to close device '%s' "
626 "which is not open.", dev_name(dev));
630 #ifndef O_DIRECT_SUPPORT
631 if (dev->flags & DEV_ACCESSED_W)
635 if (dev->open_count > 0)
638 if (immediate && dev->open_count)
639 log_debug("%s: Immediate close attempt while still referenced",
642 /* Close unless device is known to belong to a locked VG */
644 (dev->open_count < 1 &&
645 (!(info = info_from_pvid(dev->pvid, 0)) ||
647 !vgname_is_locked(info->vginfo->vgname))))
/*
 * dev_close - release DEV via _dev_close() with immediate set to 0.
 * Returns the result of _dev_close().
 */
653 int dev_close(struct device *dev)
655 return _dev_close(dev, 0);
/*
 * dev_close_immediate - release DEV via _dev_close() with immediate set
 * to 1. Returns the result of _dev_close().
 */
658 int dev_close_immediate(struct device *dev)
660 return _dev_close(dev, 1);
/*
 * dev_close_all - walk the _open_devices list and act on every device
 * whose open_count is below 1.
 *
 * The deletion-safe iterator is used, so entries may be unlinked during
 * the walk. NOTE(review): the action taken for each matching device is
 * missing from this listing; presumably it is a call to _close() -
 * confirm against the complete file.
 */
663 void dev_close_all(void)
665 struct dm_list *doh, *doht;
668 dm_list_iterate_safe(doh, doht, &_open_devices) {
669 dev = dm_list_struct_base(doh, struct device, open_list);
670 if (dev->open_count < 1)
/*
 * dev_read - read LEN bytes at byte OFFSET of DEV into BUFFER.
 *
 * The request is described in a struct device_area and handed to
 * _aligned_io() with the write flag set to 0; its result is returned.
 * A device whose open_count is zero is special-cased first.
 *
 * NOTE(review): the handling of the unopened case and the assignments of
 * the remaining device_area fields are missing from this listing.
 */
675 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
677 struct device_area where;
679 if (!dev->open_count)
683 where.start = offset;
686 return _aligned_io(&where, buffer, 0);
690 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
691 * by (offset,len) and (offset2,len2). Thus, the total size of
692 * 'buf' should be len+len2.
/*
 * dev_read_circular - read up to two regions of DEV into one buffer.
 *
 * dev:            device to read from
 * offset, len:    first region, stored at the start of BUF
 * offset2, len2:  second region, stored at BUF plus LEN
 * buf:            destination buffer
 *
 * Each region is fetched with dev_read(); a failure is logged with
 * log_error().
 *
 * NOTE(review): buf is a void pointer, so buf + len relies on the GNU
 * extension that treats void pointer arithmetic as byte arithmetic.
 * NOTE(review): the test that makes the second region optional and the
 * return statements are missing from this listing.
 */
694 int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
695 uint64_t offset2, size_t len2, void *buf)
697 if (!dev_read(dev, offset, len, buf)) {
698 log_error("Read from %s failed", dev_name(dev));
703 * The second region is optional, and allows for
704 * a circular buffer on the device.
709 if (!dev_read(dev, offset2, len2, buf + len)) {
710 log_error("Circular read from %s failed",
718 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
719 * But fails if concurrent processes writing
722 /* FIXME pre-extend the file */
/*
 * dev_append - write LEN bytes from BUFFER at the recorded end of DEV.
 *
 * The data is written with dev_write() at offset dev->end, and dev->end
 * is then advanced by LEN. A device whose open_count is zero is
 * special-cased first.
 *
 * NOTE(review): dev->end is advanced directly after the dev_write() call
 * without looking at r, so a failed write still moves the append
 * position.
 * NOTE(review): the handling of the unopened case, the code inside the
 * O_DIRECT_SUPPORT conditional and the return are missing from this
 * listing.
 */
723 int dev_append(struct device *dev, size_t len, void *buffer)
727 if (!dev->open_count)
730 r = dev_write(dev, dev->end, len, buffer);
731 dev->end += (uint64_t) len;
733 #ifndef O_DIRECT_SUPPORT
/*
 * dev_write - write LEN bytes from BUFFER at byte OFFSET of DEV.
 *
 * The request is described in a struct device_area, DEV_ACCESSED_W is set
 * on the device, and the transfer is handed to _aligned_io() with the
 * write flag set to 1; its result is returned. A device whose open_count
 * is zero is special-cased first.
 *
 * NOTE(review): the handling of the unopened case and the assignments of
 * the remaining device_area fields are missing from this listing.
 */
739 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
741 struct device_area where;
743 if (!dev->open_count)
747 where.start = offset;
/* Record that the device has been written to through this open. */
750 dev->flags |= DEV_ACCESSED_W;
752 return _aligned_io(&where, buffer, 1);
755 int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
758 char buffer[4096] __attribute((aligned(8)));
763 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
764 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
765 dev_name(dev), offset, len);
767 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
768 " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
769 len >> SECTOR_SHIFT);
771 memset(buffer, value, sizeof(buffer));
773 s = len > sizeof(buffer) ? sizeof(buffer) : len;
774 if (!dev_write(dev, offset, s, buffer))
784 dev->flags |= DEV_ACCESSED_W;