gitweb.dragonflybsd.org Git - dragonfly.git/blob

1 /*

4 * (c) UNIX System Laboratories, Inc.

5 * All or some portions of this file are derived from material licensed

6 * to the University of California by American Telephone and Telegraph

7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with

8 * the permission of UNIX System Laboratories, Inc.

9 *

10 * Redistribution and use in source and binary forms, with or without

11 * modification, are permitted provided that the following conditions

12 * are met:

13 * 1. Redistributions of source code must retain the above copyright

14 * notice, this list of conditions and the following disclaimer.

15 * 2. Redistributions in binary form must reproduce the above copyright

16 * notice, this list of conditions and the following disclaimer in the

17 * documentation and/or other materials provided with the distribution.

18 * 3. Neither the name of the University nor the names of its contributors

19 * may be used to endorse or promote products derived from this software

20 * without specific prior written permission.

21 *

22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND

23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE

26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

32 * SUCH DAMAGE.

33 *

34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95

35 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $

36 */

38 /*

39 * External virtual filesystem routines

40 */

41 #include "opt_ddb.h"

42 #include "opt_inet.h"

43 #include "opt_inet6.h"

45 #include <sys/param.h>

46 #include <sys/systm.h>

47 #include <sys/uio.h>

48 #include <sys/buf.h>

49 #include <sys/conf.h>

50 #include <sys/dirent.h>

51 #include <sys/endian.h>

52 #include <sys/eventhandler.h>

53 #include <sys/fcntl.h>

54 #include <sys/file.h>

55 #include <sys/kernel.h>

56 #include <sys/kthread.h>

57 #include <sys/malloc.h>

58 #include <sys/mbuf.h>

59 #include <sys/mount.h>

60 #include <sys/caps.h>

61 #include <sys/proc.h>

62 #include <sys/reboot.h>

63 #include <sys/socket.h>

64 #include <sys/stat.h>

65 #include <sys/sysctl.h>

66 #include <sys/syslog.h>

67 #include <sys/unistd.h>

68 #include <sys/vmmeter.h>

69 #include <sys/vnode.h>

71 #include <machine/limits.h>

73 #include <vm/vm.h>

74 #include <vm/vm_object.h>

75 #include <vm/vm_extern.h>

76 #include <vm/vm_kern.h>

77 #include <vm/pmap.h>

78 #include <vm/vm_map.h>

79 #include <vm/vm_page.h>

80 #include <vm/vm_pager.h>

81 #include <vm/vnode_pager.h>

82 #include <vm/vm_zone.h>

84 #include <sys/buf2.h>

85 #include <vm/vm_page2.h>

87 #include <netinet/in.h>

89 static MALLOC_DEFINE(M_NETCRED, "Export Host", "Export host address structure");

91 __read_mostly int numvnodes;

92 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,

93 "Number of vnodes allocated");

94 __read_mostly int verbose_reclaims;

95 SYSCTL_INT(_debug, OID_AUTO, verbose_reclaims, CTLFLAG_RD, &verbose_reclaims, 0,

96 "Output filename of reclaimed vnode(s)");

98 __read_mostly enum vtype iftovt_tab[16] = {

99 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,

100 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,

101 };

102 __read_mostly int vttoif_tab[9] = {

103 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,

104 S_IFSOCK, S_IFIFO, S_IFMT,

105 };

106

107 static int reassignbufcalls;

108 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls,

109 0, "Number of times buffers have been reassigned to the proper list");

110

111 __read_mostly static int check_buf_overlap = 2; /* invasive check */

112 SYSCTL_INT(_vfs, OID_AUTO, check_buf_overlap, CTLFLAG_RW, &check_buf_overlap,

113 0, "Enable overlapping buffer checks");

114

115 int nfs_mount_type = -1;

116 static struct lwkt_token spechash_token;

117 struct nfs_public nfs_pub; /* publicly exported FS */

118

119 __read_mostly int maxvnodes;

120 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,

121 &maxvnodes, 0, "Maximum number of vnodes");

122

123 static struct radix_node_head *vfs_create_addrlist_af(int af,

124 struct netexport *nep);

125 static void vclean_vxlocked(struct vnode *vp, int flags);

126

127 __read_mostly int prtactive = 0; /* 1 => print out reclaim of active vnodes */

128

129 /*

130 * Red black tree functions

131 */

132 static int rb_buf_compare(struct buf *b1, struct buf *b2);

133 RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, off_t, b_loffset);

134 RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, off_t, b_loffset);

135

136 static int

137 rb_buf_compare(struct buf *b1, struct buf *b2)

138 {

139 if (b1->b_loffset < b2->b_loffset)

140 return(-1);

141 if (b1->b_loffset > b2->b_loffset)

142 return(1);

143 return(0);

144 }

145

146 /*

147 * Initialize the vnode management data structures.

148 *

149 * Called from vfsinit()

150 */

151 #define VNBREAKMEM1 (1L * 1024 * 1024 * 1024)

152 #define VNBREAKMEM2 (7L * 1024 * 1024 * 1024)

153 #define MINVNODES 2000

154 #define MAXVNODES 4000000

155

156 void

157 vfs_subr_init(void)

158 {

159 int factor1; /* Limit based on ram (x 2 above 1GB) */

160 size_t freemem;

161

162 /*

163 * Size maxvnodes non-linearly to available memory. Don't bloat

164 * the count on low-memory systems. Scale up for systems with

165 * more than 1G and more than 8G of ram, but do so non-linearly

166 * because the value of a large maxvnodes count diminishes

167 * significantly beyond a certain point.

168 *

169 * The general minimum is maxproc * 8 (we want someone pushing

170 * up maxproc a lot to also get more vnodes). Usually maxproc

171 * does not affect this calculation. The KvaSize limitation also

172 * typically does not affect this calculation (it is just in case

173 * the kernel VM space is made much smaller than main memory, which

174 * should no longer happen on 64-bit systems).

175 *

176 * There isn't much of a point allowing maxvnodes to exceed a

177 * few million as modern filesystems cache pages in the

178 * underlying block device and not so much hanging off of VM

179 * objects.

180 *

181 * Also, VM objects, vnodes, and filesystem inode and other related

182 * structures have gotten a lot larger in recent years and the kernel

183 * memory use tends to scale with maxvnodes, so we don't want to bloat

184 * it too much. But neither do we want the max set too low because

185 * systems with large amounts of memory and cores are capable of

186 * doing a hell of a lot.

187 */

188 factor1 = 80 * (sizeof(struct vm_object) + sizeof(struct vnode));

189

190 freemem = (int64_t)vmstats.v_page_count * PAGE_SIZE;

191

192 maxvnodes = freemem / factor1;

193 if (freemem > VNBREAKMEM1) {

194 freemem -= VNBREAKMEM1;

195 if (freemem < VNBREAKMEM2) {

196 maxvnodes += freemem / factor1 / 2;

197 } else {

198 maxvnodes += VNBREAKMEM2 / factor1 / 2;

199 freemem -= VNBREAKMEM2;

200 maxvnodes += freemem / factor1 / 4;

201 }

202 }

203 maxvnodes = imax(maxvnodes, maxproc * 8);

204 maxvnodes = imin(maxvnodes, KvaSize / factor1);

205 maxvnodes = imin(maxvnodes, MAXVNODES);

206 maxvnodes = imax(maxvnodes, MINVNODES);

207

208 lwkt_token_init(&spechash_token, "spechash");

209 }

210

211 /*

212 * Knob to control the precision of file timestamps:

213 *

214 * 0 = seconds only; nanoseconds zeroed.

215 * 1 = microseconds accurate to tick precision

216 * 2 = microseconds accurate to tick precision (default, hz >= 100)

217 * 3 = nanoseconds accurate to tick precision

218 * 4 = microseconds, maximum precision (default, hz < 100)

219 * 5 = nanoseconds, maximum precision

220 *

221 * Note that utimes() precision is microseconds because it takes a timeval

222 * structure, so its probably best to default to USEC or USEC_PRECISE, and

223 * not NSEC.

224 */

225 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC,

226 TSP_USEC_PRECISE, TSP_NSEC_PRECISE };

227

228 __read_mostly static int timestamp_precision = -1;

229 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,

230 &timestamp_precision, 0, "Precision of file timestamps");

231

232 /*

233 * Get a current timestamp.

234 *

235 * MPSAFE

236 */

237 void

238 vfs_timestamp(struct timespec *tsp)

239 {

240 switch (timestamp_precision) {

241 case TSP_SEC: /* seconds precision */

242 getnanotime(tsp);

243 tsp->tv_nsec = 0;

244 break;

245 case TSP_HZ: /* ticks precision (limit to microseconds) */

246 getnanotime(tsp);

247 tsp->tv_nsec -= tsp->tv_nsec % 1000;

248 break;

249 default:

250 case TSP_USEC: /* microseconds (ticks precision) */

251 getnanotime(tsp);

252 tsp->tv_nsec -= tsp->tv_nsec % 1000;

253 break;

254 case TSP_NSEC: /* nanoseconds (ticks precision) */

255 getnanotime(tsp);

256 break;

257 case TSP_USEC_PRECISE: /* microseconds (high preceision) */

258 nanotime(tsp);

259 tsp->tv_nsec -= tsp->tv_nsec % 1000;

260 break;

261 case TSP_NSEC_PRECISE: /* nanoseconds (high precision) */

262 nanotime(tsp);

263 break;

264 }

265 }

266

267 /*

268 * Set vnode attributes to VNOVAL

269 */

270 void

271 vattr_null(struct vattr *vap)

272 {

273 vap->va_type = VNON;

274 vap->va_size = VNOVAL;

275 vap->va_bytes = VNOVAL;

276 vap->va_mode = VNOVAL;

277 vap->va_nlink = VNOVAL;

278 vap->va_uid = VNOVAL;

279 vap->va_gid = VNOVAL;

280 vap->va_fsid = VNOVAL;

281 vap->va_fileid = VNOVAL;

282 vap->va_blocksize = VNOVAL;

283 vap->va_rmajor = VNOVAL;

284 vap->va_rminor = VNOVAL;

285 vap->va_atime.tv_sec = VNOVAL;

286 vap->va_atime.tv_nsec = VNOVAL;

287 vap->va_mtime.tv_sec = VNOVAL;

288 vap->va_mtime.tv_nsec = VNOVAL;

289 vap->va_ctime.tv_sec = VNOVAL;

290 vap->va_ctime.tv_nsec = VNOVAL;

291 vap->va_flags = VNOVAL;

292 vap->va_gen = VNOVAL;

293 vap->va_vaflags = 0;

294 /* va_*_uuid fields are only valid if related flags are set */

295 }

296

297 /*

298 * Flush out and invalidate all buffers associated with a vnode.

299 *

300 * vp must be locked.

301 */

302 static int vinvalbuf_bp(struct buf *bp, void *data);

303

304 struct vinvalbuf_bp_info {

305 struct vnode *vp;

306 int slptimeo;

307 int lkflags;

308 int flags;

309 int clean;

310 };

311

312 int

313 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)

314 {

315 struct vinvalbuf_bp_info info;

316 vm_object_t object;

317 int error;

318

319 lwkt_gettoken(&vp->v_token);

320

321 /*

322 * If we are being asked to save, call fsync to ensure that the inode

323 * is updated.

324 */

325 if (flags & V_SAVE) {

326 error = bio_track_wait(&vp->v_track_write, slpflag, slptimeo);

327 if (error)

328 goto done;

329 if (!RB_EMPTY(&vp->v_rbdirty_tree)) {

330 if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)) != 0)

331 goto done;

332 #if 0

333 /*

334 * Dirty bufs may be left or generated via races

335 * in circumstances where vinvalbuf() is called on

336 * a vnode not undergoing reclamation. Only

337 * panic if we are trying to reclaim the vnode.

338 */

339 if ((vp->v_flag & VRECLAIMED) &&

340 (bio_track_active(&vp->v_track_write) ||

341 !RB_EMPTY(&vp->v_rbdirty_tree))) {

342 panic("vinvalbuf: dirty bufs");

343 }

344 #endif

345 }

346 }

347 info.slptimeo = slptimeo;

348 info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;

349 if (slpflag & PCATCH)

350 info.lkflags |= LK_PCATCH;

351 info.flags = flags;

352 info.vp = vp;

353

354 /*

355 * Flush the buffer cache until nothing is left, wait for all I/O

356 * to complete. At least one pass is required. We might block

357 * in the pip code so we have to re-check. Order is important.

358 */

359 do {

360 /*

361 * Flush buffer cache

362 */

363 if (!RB_EMPTY(&vp->v_rbclean_tree)) {

364 info.clean = 1;

365 error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,

366 NULL, vinvalbuf_bp, &info);

367 }

368 if (!RB_EMPTY(&vp->v_rbdirty_tree)) {

369 info.clean = 0;

370 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

371 NULL, vinvalbuf_bp, &info);

372 }

373

374 /*

375 * Wait for I/O completion.

376 */

377 bio_track_wait(&vp->v_track_write, 0, 0);

378 if ((object = vp->v_object) != NULL)

379 refcount_wait(&object->paging_in_progress, "vnvlbx");

380 } while (bio_track_active(&vp->v_track_write) ||

381 !RB_EMPTY(&vp->v_rbclean_tree) ||

382 !RB_EMPTY(&vp->v_rbdirty_tree));

383

384 /*

385 * Destroy the copy in the VM cache, too.

386 */

387 if ((object = vp->v_object) != NULL) {

388 vm_object_page_remove(object, 0, 0,

389 (flags & V_SAVE) ? TRUE : FALSE);

390 }

391

392 if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))

393 panic("vinvalbuf: flush failed");

394 if (!RB_EMPTY(&vp->v_rbhash_tree))

395 panic("vinvalbuf: flush failed, buffers still present");

396 error = 0;

397 done:

398 lwkt_reltoken(&vp->v_token);

399 return (error);

400 }

401

402 static int

403 vinvalbuf_bp(struct buf *bp, void *data)

404 {

405 struct vinvalbuf_bp_info *info = data;

406 int error;

407

408 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {

409 atomic_add_int(&bp->b_refs, 1);

410 error = BUF_TIMELOCK(bp, info->lkflags,

411 "vinvalbuf", info->slptimeo);

412 atomic_subtract_int(&bp->b_refs, 1);

413 if (error == 0) {

414 BUF_UNLOCK(bp);

415 error = ENOLCK;

416 }

417 if (error == ENOLCK)

418 return(0);

419 return (-error);

420 }

421 KKASSERT(bp->b_vp == info->vp);

422

423 /*

424 * Must check clean/dirty status after successfully locking as

425 * it may race.

426 */

427 if ((info->clean && (bp->b_flags & B_DELWRI)) ||

428 (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0)) {

429 BUF_UNLOCK(bp);

430 return(0);

431 }

432

433 /*

434 * NOTE: NO B_LOCKED CHECK. Also no buf_checkwrite()

435 * check. This code will write out the buffer, period.

436 */

437 bremfree(bp);

438 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&

439 (info->flags & V_SAVE)) {

440 cluster_awrite(bp);

441 } else if (info->flags & V_SAVE) {

442 /*

443 * Cannot set B_NOCACHE on a clean buffer as this will

444 * destroy the VM backing store which might actually

445 * be dirty (and unsynchronized).

446 */

447 bp->b_flags |= (B_INVAL | B_RELBUF);

448 brelse(bp);

449 } else {

450 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);

451 brelse(bp);

452 }

453 return(0);

454 }

455

456 /*

457 * Truncate a file's buffer and pages to a specified length. This

458 * is in lieu of the old vinvalbuf mechanism, which performed unneeded

459 * sync activity.

460 *

461 * The vnode must be locked.

462 */

463 static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);

464 static int vtruncbuf_bp_trunc(struct buf *bp, void *data);

465 static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);

466 static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

467

468 struct vtruncbuf_info {

469 struct vnode *vp;

470 off_t truncloffset;

471 int clean;

472 };

473

474 int

475 vtruncbuf(struct vnode *vp, off_t length, int blksize)

476 {

477 struct vtruncbuf_info info;

478 const char *filename;

479 int count;

480

481 /*

482 * Round up to the *next* block, then destroy the buffers in question.

483 * Since we are only removing some of the buffers we must rely on the

484 * scan count to determine whether a loop is necessary.

485 */

486 if ((count = (int)(length % blksize)) != 0)

487 info.truncloffset = length + (blksize - count);

488 else

489 info.truncloffset = length;

490 info.vp = vp;

491

492 lwkt_gettoken(&vp->v_token);

493 do {

494 info.clean = 1;

495 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,

496 vtruncbuf_bp_trunc_cmp,

497 vtruncbuf_bp_trunc, &info);

498 info.clean = 0;

499 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

500 vtruncbuf_bp_trunc_cmp,

501 vtruncbuf_bp_trunc, &info);

502 } while(count);

503

504 /*

505 * For safety, fsync any remaining metadata if the file is not being

506 * truncated to 0. Since the metadata does not represent the entire

507 * dirty list we have to rely on the hit count to ensure that we get

508 * all of it.

509 */

510 if (length > 0) {

511 do {

512 count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

513 vtruncbuf_bp_metasync_cmp,

514 vtruncbuf_bp_metasync, &info);

515 } while (count);

516 }

517

518 /*

519 * Clean out any left over VM backing store.

520 *

521 * It is possible to have in-progress I/O from buffers that were

522 * not part of the truncation. This should not happen if we

523 * are truncating to 0-length.

524 */

525 vnode_pager_setsize(vp, length);

526 bio_track_wait(&vp->v_track_write, 0, 0);

527

528 /*

529 * Debugging only

530 */

531 spin_lock(&vp->v_spin);

532 filename = TAILQ_FIRST(&vp->v_namecache) ?

533 TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";

534 spin_unlock(&vp->v_spin);

535

536 /*

537 * Make sure no buffers were instantiated while we were trying

538 * to clean out the remaining VM pages. This could occur due

539 * to busy dirty VM pages being flushed out to disk.

540 */

541 do {

542 info.clean = 1;

543 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,

544 vtruncbuf_bp_trunc_cmp,

545 vtruncbuf_bp_trunc, &info);

546 info.clean = 0;

547 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

548 vtruncbuf_bp_trunc_cmp,

549 vtruncbuf_bp_trunc, &info);

550 if (count) {

551 kprintf("Warning: vtruncbuf(): Had to re-clean %d "

552 "left over buffers in %s\n", count, filename);

553 }

554 } while(count);

555

556 lwkt_reltoken(&vp->v_token);

557

558 return (0);

559 }

560

561 /*

562 * The callback buffer is beyond the new file EOF and must be destroyed.

563 * Note that the compare function must conform to the RB_SCAN's requirements.

564 */

565 static

566 int

567 vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)

568 {

569 struct vtruncbuf_info *info = data;

570

571 if (bp->b_loffset >= info->truncloffset)

572 return(0);

573 return(-1);

574 }

575

576 static

577 int

578 vtruncbuf_bp_trunc(struct buf *bp, void *data)

579 {

580 struct vtruncbuf_info *info = data;

581

582 /*

583 * Do not try to use a buffer we cannot immediately lock, but sleep

584 * anyway to prevent a livelock. The code will loop until all buffers

585 * can be acted upon.

586 *

587 * We must always revalidate the buffer after locking it to deal

588 * with MP races.

589 */

590 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {

591 atomic_add_int(&bp->b_refs, 1);

592 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)

593 BUF_UNLOCK(bp);

594 atomic_subtract_int(&bp->b_refs, 1);

595 } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||

596 (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||

597 bp->b_vp != info->vp ||

598 vtruncbuf_bp_trunc_cmp(bp, data)) {

599 BUF_UNLOCK(bp);

600 } else {

601 bremfree(bp);

602 bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);

603 brelse(bp);

604 }

605 return(1);

606 }

607

608 /*

609 * Fsync all meta-data after truncating a file to be non-zero. Only metadata

610 * blocks (with a negative loffset) are scanned.

611 * Note that the compare function must conform to the RB_SCAN's requirements.

612 */

613 static int

614 vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)

615 {

616 if (bp->b_loffset < 0)

617 return(0);

618 return(1);

619 }

620

621 static int

622 vtruncbuf_bp_metasync(struct buf *bp, void *data)

623 {

624 struct vtruncbuf_info *info = data;

625

626 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {

627 atomic_add_int(&bp->b_refs, 1);

628 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)

629 BUF_UNLOCK(bp);

630 atomic_subtract_int(&bp->b_refs, 1);

631 } else if ((bp->b_flags & B_DELWRI) == 0 ||

632 bp->b_vp != info->vp ||

633 vtruncbuf_bp_metasync_cmp(bp, data)) {

634 BUF_UNLOCK(bp);

635 } else {

636 bremfree(bp);

637 if (bp->b_vp == info->vp)

638 bawrite(bp);

639 else

640 bwrite(bp);

641 }

642 return(1);

643 }

644

645 /*

646 * vfsync - implements a multipass fsync on a file which understands

647 * dependancies and meta-data. The passed vnode must be locked. The

648 * waitfor argument may be MNT_WAIT or MNT_NOWAIT, or MNT_LAZY.

649 *

650 * When fsyncing data asynchronously just do one consolidated pass starting

651 * with the most negative block number. This may not get all the data due

652 * to dependancies.

653 *

654 * When fsyncing data synchronously do a data pass, then a metadata pass,

655 * then do additional data+metadata passes to try to get all the data out.

656 *

657 * Caller must ref the vnode but does not have to lock it.

658 */

659 static int vfsync_wait_output(struct vnode *vp,

660 int (*waitoutput)(struct vnode *, struct thread *));

661 static int vfsync_dummy_cmp(struct buf *bp __unused, void *data __unused);

662 static int vfsync_data_only_cmp(struct buf *bp, void *data);

663 static int vfsync_meta_only_cmp(struct buf *bp, void *data);

664 static int vfsync_lazy_range_cmp(struct buf *bp, void *data);

665 static int vfsync_bp(struct buf *bp, void *data);

666

667 struct vfsync_info {

668 struct vnode *vp;

669 int fastpass;

670 int synchronous;

671 int syncdeps;

672 int lazycount;

673 int lazylimit;

674 int skippedbufs;

675 int (*checkdef)(struct buf *);

676 int (*cmpfunc)(struct buf *, void *);

677 };

678

679 int

680 vfsync(struct vnode *vp, int waitfor, int passes,

681 int (*checkdef)(struct buf *),

682 int (*waitoutput)(struct vnode *, struct thread *))

683 {

684 struct vfsync_info info;

685 int error;

686

687 bzero(&info, sizeof(info));

688 info.vp = vp;

689 if ((info.checkdef = checkdef) == NULL)

690 info.syncdeps = 1;

691

692 lwkt_gettoken(&vp->v_token);

693

694 switch(waitfor) {

695 case MNT_LAZY | MNT_NOWAIT:

696 case MNT_LAZY:

697 /*

698 * Lazy (filesystem syncer typ) Asynchronous plus limit the

699 * number of data (not meta) pages we try to flush to 1MB.

700 * A non-zero return means that lazy limit was reached.

701 */

702 info.lazylimit = 1024 * 1024;

703 info.syncdeps = 1;

704 info.cmpfunc = vfsync_lazy_range_cmp;

705 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

706 vfsync_lazy_range_cmp, vfsync_bp, &info);

707 info.cmpfunc = vfsync_meta_only_cmp;

708 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,

709 vfsync_meta_only_cmp, vfsync_bp, &info);

710 if (error == 0)

711 vp->v_lazyw = 0;

712 else if (!RB_EMPTY(&vp->v_rbdirty_tree))

713 vn_syncer_add(vp, 1);

714 error = 0;

715 break;

716 case MNT_NOWAIT:

717 /*

718 * Asynchronous. Do a data-only pass and a meta-only pass.

719 */

720 info.syncdeps = 1;

721 info.cmpfunc = vfsync_data_only_cmp;

722 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,

723 vfsync_bp, &info);

724 info.cmpfunc = vfsync_meta_only_cmp;

725 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,

726 vfsync_bp, &info);

727 error = 0;

728 break;

729 default:

730 /*

731 * Synchronous. Do a data-only pass, then a meta-data+data

732 * pass, then additional integrated passes to try to get

733 * all the dependancies flushed.

734 */

735 info.cmpfunc = vfsync_data_only_cmp;

736 info.fastpass = 1;

737 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,

738 vfsync_bp, &info);

739 info.fastpass = 0;

740 error = vfsync_wait_output(vp, waitoutput);

741 if (error == 0) {

742 info.skippedbufs = 0;

743 info.cmpfunc = vfsync_dummy_cmp;

744 RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,

745 vfsync_bp, &info);

746 error = vfsync_wait_output(vp, waitoutput);

747 if (info.skippedbufs) {

748 kprintf("Warning: vfsync skipped %d dirty "

749 "buf%s in pass2!\n",

750 info.skippedbufs,

751 ((info.skippedbufs > 1) ? "s" : ""));

752 }

753 }

754 while (error == 0 && passes > 0 &&

755 !RB_EMPTY(&vp->v_rbdirty_tree)

756 ) {

757 info.skippedbufs = 0;

758 if (--passes == 0) {

759 info.synchronous = 1;

760 info.syncdeps = 1;

761 }

762 info.cmpfunc = vfsync_dummy_cmp;

763 error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,

764 vfsync_bp, &info);

765 if (error < 0)

766 error = -error;

767 info.syncdeps = 1;

768 if (error == 0)

769 error = vfsync_wait_output(vp, waitoutput);

770 if (info.skippedbufs && passes == 0) {

771 kprintf("Warning: vfsync skipped %d dirty "

772 "buf%s in final pass!\n",

773 info.skippedbufs,

774 ((info.skippedbufs > 1) ? "s" : ""));

775 }

776 }

777 #if 0

778 /*

779 * This case can occur normally because vnode lock might

780 * not be held.

781 */

782 if (!RB_EMPTY(&vp->v_rbdirty_tree))

783 kprintf("dirty bufs left after final pass\n");

784 #endif

785 break;

786 }

787 lwkt_reltoken(&vp->v_token);

788

789 return(error);

790 }

791

792 static int

793 vfsync_wait_output(struct vnode *vp,

794 int (*waitoutput)(struct vnode *, struct thread *))

795 {

796 int error;

797

798 error = bio_track_wait(&vp->v_track_write, 0, 0);

799 if (waitoutput)

800 error = waitoutput(vp, curthread);

801 return(error);

802 }

803

804 static int

805 vfsync_dummy_cmp(struct buf *bp __unused, void *data __unused)

806 {

807 return(0);

808 }

809

810 static int

811 vfsync_data_only_cmp(struct buf *bp, void *data)

812 {

813 if (bp->b_loffset < 0)

814 return(-1);

815 return(0);

816 }

817

818 static int

819 vfsync_meta_only_cmp(struct buf *bp, void *data)

820 {

821 if (bp->b_loffset < 0)

822 return(0);

823 return(1);

824 }

825

826 static int

827 vfsync_lazy_range_cmp(struct buf *bp, void *data)

828 {

829 struct vfsync_info *info = data;

830

831 if (bp->b_loffset < info->vp->v_lazyw)

832 return(-1);

833 return(0);

834 }

835

836 static int

837 vfsync_bp(struct buf *bp, void *data)

838 {

839 struct vfsync_info *info = data;

840 struct vnode *vp = info->vp;

841 int error;

842

843 if (info->fastpass) {

844 /*

845 * Ignore buffers that we cannot immediately lock.

846 */

847 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {

848 /*

849 * Removed BUF_TIMELOCK(..., 1), even a 1-tick

850 * delay can mess up performance

851 *

852 * Another reason is that during a dirty-buffer

853 * scan a clustered write can start I/O on buffers

854 * ahead of the scan, causing the scan to not

855 * get a lock here. Usually this means the write

856 * is already in progress so, in fact, we *want*

857 * to skip the buffer.

858 */

859 ++info->skippedbufs;

860 return(0);

861 }

862 } else if (info->synchronous == 0) {

863 /*

864 * Normal pass, give the buffer a little time to become

865 * available to us.

866 */

867 if (BUF_TIMELOCK(bp, LK_EXCLUSIVE, "bflst2", hz / 10)) {

868 ++info->skippedbufs;

869 return(0);

870 }

871 } else {

872 /*

873 * Synchronous pass, give the buffer a lot of time before

874 * giving up.

875 */

876 if (BUF_TIMELOCK(bp, LK_EXCLUSIVE, "bflst3", hz * 10)) {

877 ++info->skippedbufs;

878 return(0);

879 }

880 }

881

882 /*

883 * We must revalidate the buffer after locking.

884 */

885 if ((bp->b_flags & B_DELWRI) == 0 ||

886 bp->b_vp != info->vp ||

887 info->cmpfunc(bp, data)) {

888 BUF_UNLOCK(bp);

889 return(0);

890 }

891

892 /*

893 * If syncdeps is not set we do not try to write buffers which have

894 * dependancies.

895 */

896 if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp)) {

897 BUF_UNLOCK(bp);

898 return(0);

899 }

900

901 /*

902 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer

903 * has been written but an additional handshake with the device

904 * is required before we can dispose of the buffer. We have no idea

905 * how to do this so we have to skip these buffers.

906 */

907 if (bp->b_flags & B_NEEDCOMMIT) {

908 BUF_UNLOCK(bp);

909 return(0);

910 }

911

912 /*

913 * Ask bioops if it is ok to sync. If not the VFS may have

914 * set B_LOCKED so we have to cycle the buffer.

915 */

916 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {

917 bremfree(bp);

918 brelse(bp);

919 return(0);

920 }

921

922 if (info->synchronous) {

923 /*

924 * Synchronous flush. An error may be returned and will

925 * stop the scan.

926 */

927 bremfree(bp);

928 error = bwrite(bp);

929 } else {

930 /*

931 * Asynchronous flush. We use the error return to support

932 * MNT_LAZY flushes.

933 *

934 * In low-memory situations we revert to synchronous

935 * operation. This should theoretically prevent the I/O

936 * path from exhausting memory in a non-recoverable way.

937 */

938 vp->v_lazyw = bp->b_loffset;

939 bremfree(bp);

940 if (vm_paging_min()) {

941 /* low memory */

942 info->lazycount += bp->b_bufsize;

943 bwrite(bp);

944 } else {

945 /* normal */

946 info->lazycount += cluster_awrite(bp);

947 waitrunningbufspace();

948 /*vm_wait_nominal();*/

949 }

950 if (info->lazylimit && info->lazycount >= info->lazylimit)

951 error = 1;

952 else

953 error = 0;

954 }

955 return(-error);

956 }

957

958 /*

959 * Associate a buffer with a vnode.

960 *

961 * MPSAFE

962 */

963 int

964 bgetvp(struct vnode *vp, struct buf *bp, int testsize)

965 {

966 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

967 KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0);

968

969 /*

970 * Insert onto list for new vnode.

971 */

972 lwkt_gettoken(&vp->v_token);

973

974 if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp)) {

975 lwkt_reltoken(&vp->v_token);

976 return (EEXIST);

977 }

978

979 /*

980 * Diagnostics (mainly for HAMMER debugging). Check for

981 * overlapping buffers.

982 */

983 if (check_buf_overlap) {

984 struct buf *bx;

985 bx = buf_rb_hash_RB_PREV(bp);

986 if (bx) {

987 if (bx->b_loffset + bx->b_bufsize > bp->b_loffset) {

988 kprintf("bgetvp: overlapl %016jx/%d %016jx "

989 "bx %p bp %p\n",

990 (intmax_t)bx->b_loffset,

991 bx->b_bufsize,

992 (intmax_t)bp->b_loffset,

993 bx, bp);

994 if (check_buf_overlap > 1)

995 panic("bgetvp - overlapping buffer");

996 }

997 }

998 bx = buf_rb_hash_RB_NEXT(bp);

999 if (bx) {

1000 if (bp->b_loffset + testsize > bx->b_loffset) {

1001 kprintf("bgetvp: overlapr %016jx/%d %016jx "

1002 "bp %p bx %p\n",

1003 (intmax_t)bp->b_loffset,

1004 testsize,

1005 (intmax_t)bx->b_loffset,

1006 bp, bx);

1007 if (check_buf_overlap > 1)

1008 panic("bgetvp - overlapping buffer");

1009 }

1010 }

1011 }

1012 bp->b_vp = vp;

1013 bp->b_flags |= B_HASHED;

1014 bp->b_flags |= B_VNCLEAN;

1015 if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))

1016 panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);

1017 /*vhold(vp);*/

1018 lwkt_reltoken(&vp->v_token);

1019 return(0);

1020 }

1021

1022 /*

1023 * Disassociate a buffer from a vnode.

1024 *

1025 * MPSAFE

1026 */

1027 void

1028 brelvp(struct buf *bp)

1029 {

1030 struct vnode *vp;

1031

1032 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

1033

1034 /*

1035 * Delete from old vnode list, if on one.

1036 */

1037 vp = bp->b_vp;

1038 lwkt_gettoken(&vp->v_token);

1039 if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN)) {

1040 if (bp->b_flags & B_VNDIRTY)

1041 buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);

1042 else

1043 buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);

1044 bp->b_flags &= ~(B_VNDIRTY | B_VNCLEAN);

1045 }

1046 if (bp->b_flags & B_HASHED) {

1047 buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);

1048 bp->b_flags &= ~B_HASHED;

1049 }

1050

1051 /*

1052 * Only remove from synclist when no dirty buffers are left AND

1053 * the VFS has not flagged the vnode's inode as being dirty.

1054 */

1055 if ((vp->v_flag & (VONWORKLST | VISDIRTY | VOBJDIRTY)) == VONWORKLST &&

1056 RB_EMPTY(&vp->v_rbdirty_tree)) {

1057 vn_syncer_remove(vp, 0);

1058 }

1059 bp->b_vp = NULL;

1060

1061 lwkt_reltoken(&vp->v_token);

1062

1063 /*vdrop(vp);*/

1064 }

1065

1066 /*

1067 * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.

1068 * This routine is called when the state of the B_DELWRI bit is changed.

1069 *

1070 * Must be called with vp->v_token held.

1071 * MPSAFE

1072 */

1073 void

1074 reassignbuf(struct buf *bp)

1075 {

1076 struct vnode *vp = bp->b_vp;

1077 int delay;

1078

1079 ASSERT_LWKT_TOKEN_HELD(&vp->v_token);

1080 ++reassignbufcalls;

1081

1082 /*

1083 * B_PAGING flagged buffers cannot be reassigned because their vp

1084 * is not fully linked in.

1085 */

1086 if (bp->b_flags & B_PAGING)

1087 panic("cannot reassign paging buffer");

1088

1089 if (bp->b_flags & B_DELWRI) {

1090 /*

1091 * Move to the dirty list, add the vnode to the worklist

1092 */

1093 if (bp->b_flags & B_VNCLEAN) {

1094 buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);

1095 bp->b_flags &= ~B_VNCLEAN;

1096 }

1097 if ((bp->b_flags & B_VNDIRTY) == 0) {

1098 if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {

1099 panic("reassignbuf: dup lblk vp %p bp %p",

1100 vp, bp);

1101 }

1102 bp->b_flags |= B_VNDIRTY;

1103 }

1104 if ((vp->v_flag & VONWORKLST) == 0) {

1105 switch (vp->v_type) {

1106 case VDIR:

1107 delay = dirdelay;

1108 break;

1109 case VCHR:

1110 case VBLK:

1111 if (vp->v_rdev &&

1112 vp->v_rdev->si_mountpoint != NULL) {

1113 delay = metadelay;

1114 break;

1115 }

1116 /* fall through */

1117 default:

1118 delay = filedelay;

1119 }

1120 vn_syncer_add(vp, delay);

1121 }

1122 } else {

1123 /*

1124 * Move to the clean list, remove the vnode from the worklist

1125 * if no dirty blocks remain.

1126 */

1127 if (bp->b_flags & B_VNDIRTY) {

1128 buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);

1129 bp->b_flags &= ~B_VNDIRTY;

1130 }

1131 if ((bp->b_flags & B_VNCLEAN) == 0) {

1132 if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {

1133 panic("reassignbuf: dup lblk vp %p bp %p",

1134 vp, bp);

1135 }

1136 bp->b_flags |= B_VNCLEAN;

1137 }

1138

1139 /*

1140 * Only remove from synclist when no dirty buffers are left

1141 * AND the VFS has not flagged the vnode's inode as being

1142 * dirty.

1143 */

1144 if ((vp->v_flag & (VONWORKLST | VISDIRTY | VOBJDIRTY)) ==

1145 VONWORKLST &&

1146 RB_EMPTY(&vp->v_rbdirty_tree)) {

1147 vn_syncer_remove(vp, 0);

1148 }

1149 }

1150 }

1151

1152 /*

1153 * Create a vnode for a block device. Used for mounting the root file

1154 * system.

1155 *

1156 * A vref()'d vnode is returned.

1157 */

1158 extern struct vop_ops *devfs_vnode_dev_vops_p;

1159 int

1160 bdevvp(cdev_t dev, struct vnode **vpp)

1161 {

1162 struct vnode *vp;

1163 struct vnode *nvp;

1164 int error;

1165

1166 if (dev == NULL) {

1167 *vpp = NULLVP;

1168 return (ENXIO);

1169 }

1170 error = getspecialvnode(VT_NON, NULL, &devfs_vnode_dev_vops_p,

1171 &nvp, 0, 0);

1172 if (error) {

1173 *vpp = NULLVP;

1174 return (error);

1175 }

1176 vp = nvp;

1177 vp->v_type = VCHR;

1178 #if 0

1179 vp->v_rdev = dev;

1180 #endif

1181 v_associate_rdev(vp, dev);

1182 vp->v_umajor = dev->si_umajor;

1183 vp->v_uminor = dev->si_uminor;

1184 vx_unlock(vp);

1185 *vpp = vp;

1186 return (0);

1187 }

1188

1189 int

1190 v_associate_rdev(struct vnode *vp, cdev_t dev)

1191 {

1192 if (dev == NULL)

1193 return(ENXIO);

1194 if (dev_is_good(dev) == 0)

1195 return(ENXIO);

1196 KKASSERT(vp->v_rdev == NULL);

1197 vp->v_rdev = reference_dev(dev);

1198 lwkt_gettoken(&spechash_token);

1199 SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_cdevnext);

1200 lwkt_reltoken(&spechash_token);

1201 return(0);

1202 }

1203

1204 void

1205 v_release_rdev(struct vnode *vp)

1206 {

1207 cdev_t dev;

1208

1209 if ((dev = vp->v_rdev) != NULL) {

1210 lwkt_gettoken(&spechash_token);

1211 SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_cdevnext);

1212 vp->v_rdev = NULL;

1213 release_dev(dev);

1214 lwkt_reltoken(&spechash_token);

1215 }

1216 }

1217

1218 /*

1219 * Add a vnode to the alias list hung off the cdev_t. We only associate

1220 * the device number with the vnode. The actual device is not associated

1221 * until the vnode is opened (usually in spec_open()), and will be

1222 * disassociated on last close.

1223 */

1224 void

1225 addaliasu(struct vnode *nvp, int x, int y)

1226 {

1227 if (nvp->v_type != VBLK && nvp->v_type != VCHR)

1228 panic("addaliasu on non-special vnode");

1229 nvp->v_umajor = x;

1230 nvp->v_uminor = y;

1231 }

1232

/*
 * Best-effort attempt by a filesystem to get rid of a vnode.
 *
 * The vnode is acquired with vx_get() and is destroyed via
 * vgone_vxlocked() only if VREFCNT() is no greater than 1 at that
 * point; otherwise nothing happens.  There is no return value, so the
 * filesystem should check afterwards whether its in-memory inode still
 * references the vp.
 *
 * May only be called if the vnode is in a known state (i.e. being
 * prevented from being deallocated by some other condition such as a
 * vfs inode hold).
 */
void
vclean_unlocked(struct vnode *vp)
{
	vx_get(vp);
	if (VREFCNT(vp) <= 1) {
		vgone_vxlocked(vp);
	}
	vx_put(vp);
}

1254

1255 /*

1256 * Disassociate a vnode from its underlying filesystem.

1257 *

1258 * The vnode must be VX locked and referenced. In all normal situations

1259 * there are no active references. If vclean_vxlocked() is called while

1260 * there are active references, the vnode is being ripped out and we have

1261 * to call VOP_CLOSE() as appropriate before we can reclaim it.

1262 */

1263 static void

1264 vclean_vxlocked(struct vnode *vp, int flags)

1265 {

1266 int active;

1267 int n;

1268 vm_object_t object;

1269 struct namecache *ncp;

1270

1271 /*

1272 * If the vnode has already been reclaimed we have nothing to do.

1273 */

1274 if (vp->v_flag & VRECLAIMED)

1275 return;

1276

1277 /*

1278 * Set flag to interlock operation, flag finalization to ensure

1279 * that the vnode winds up on the inactive list, and set v_act to 0.

1280 */

1281 vsetflags(vp, VRECLAIMED);

1282 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

1283 vp->v_act = 0;

1284

1285 if (verbose_reclaims) {

1286 if ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL)

1287 kprintf("Debug: reclaim %p %s\n", vp, ncp->nc_name);

1288 }

1289

1290 /*

1291 * Scrap the vfs cache

1292 */

1293 while (cache_inval_vp(vp, 0) != 0) {

1294 kprintf("Warning: vnode %p clean/cache_resolution "

1295 "race detected\n", vp);

1296 tsleep(vp, 0, "vclninv", 2);

1297 }

1298

1299 /*

1300 * Check to see if the vnode is in use. If so we have to reference it

1301 * before we clean it out so that its count cannot fall to zero and

1302 * generate a race against ourselves to recycle it.

1303 */

1304 active = (VREFCNT(vp) > 0);

1305

1306 /*

1307 * Clean out any buffers associated with the vnode and destroy its

1308 * object, if it has one.

1309 */

1310 vinvalbuf(vp, V_SAVE, 0, 0);

1311

1312 /*

1313 * If purging an active vnode (typically during a forced unmount

1314 * or reboot), it must be closed and deactivated before being

1315 * reclaimed. This isn't really all that safe, but what can

1316 * we do? XXX.

1317 *

1318 * Note that neither of these routines unlocks the vnode.

1319 */

1320 if (active && (flags & DOCLOSE)) {

1321 while ((n = vp->v_opencount) != 0) {

1322 if (vp->v_writecount)

1323 VOP_CLOSE(vp, FWRITE|FNONBLOCK, NULL);

1324 else

1325 VOP_CLOSE(vp, FNONBLOCK, NULL);

1326 if (vp->v_opencount == n) {

1327 kprintf("Warning: unable to force-close"

1328 " vnode %p\n", vp);

1329 break;

1330 }

1331 }

1332 }

1333

1334 /*

1335 * If the vnode has not been deactivated, deactivated it. Deactivation

1336 * can create new buffers and VM pages so we have to call vinvalbuf()

1337 * again to make sure they all get flushed.

1338 *

1339 * This can occur if a file with a link count of 0 needs to be

1340 * truncated.

1341 *

1342 * If the vnode is already dead don't try to deactivate it.

1343 */

1344 if ((vp->v_flag & VINACTIVE) == 0) {

1345 vsetflags(vp, VINACTIVE);

1346 if (vp->v_mount)

1347 VOP_INACTIVE(vp);

1348 vinvalbuf(vp, V_SAVE, 0, 0);

1349 }

1350

1351 /*

1352 * If the vnode has an object, destroy it.

1353 */

1354 while ((object = vp->v_object) != NULL) {

1355 vm_object_hold(object);

1356 if (object == vp->v_object)

1357 break;

1358 vm_object_drop(object);

1359 }

1360

1361 if (object != NULL) {

1362 if (object->ref_count == 0) {

1363 if ((object->flags & OBJ_DEAD) == 0)

1364 vm_object_terminate(object);

1365 vm_object_drop(object);

1366 vclrflags(vp, VOBJBUF);

1367 } else {

1368 vm_pager_deallocate(object);

1369 vclrflags(vp, VOBJBUF);

1370 vm_object_drop(object);

1371 }

1372 }

1373 KKASSERT((vp->v_flag & VOBJBUF) == 0);

1374

1375 if (vp->v_flag & VOBJDIRTY)

1376 vclrobjdirty(vp);

1377

1378 /*

1379 * Reclaim the vnode if not already dead.

1380 */

1381 if (vp->v_mount && VOP_RECLAIM(vp))

1382 panic("vclean: cannot reclaim");

1383

1384 /*

1385 * Done with purge, notify sleepers of the grim news.

1386 */

1387 vp->v_ops = &dead_vnode_vops_p;

1388 vn_gone(vp);

1389 vp->v_tag = VT_NON;

1390

1391 /*

1392 * If we are destroying an active vnode, reactivate it now that

1393 * we have reassociated it with deadfs. This prevents the system

1394 * from crashing on the vnode due to it being unexpectedly marked

1395 * as inactive or reclaimed.

1396 */

1397 if (active && (flags & DOCLOSE)) {

1398 vclrflags(vp, VINACTIVE | VRECLAIMED);

1399 }

1400 }

1401

1402 /*

1403 * Eliminate all activity associated with the requested vnode

1404 * and with all vnodes aliased to the requested vnode.

1405 *

1406 * The vnode must be referenced but should not be locked.

1407 */

1408 int

1409 vrevoke(struct vnode *vp, struct ucred *cred)

1410 {

1411 struct vnode *vq;

1412 struct vnode *vqn;

1413 cdev_t dev;

1414 int error;

1415

1416 /*

1417 * If the vnode has a device association, scrap all vnodes associated

1418 * with the device. Don't let the device disappear on us while we

1419 * are scrapping the vnodes.

1420 *

1421 * The passed vp will probably show up in the list, do not VX lock

1422 * it twice!

1423 *

1424 * Releasing the vnode's rdev here can mess up specfs's call to

1425 * device close, so don't do it. The vnode has been disassociated

1426 * and the device will be closed after the last ref on the related

1427 * fp goes away (if not still open by e.g. the kernel).

1428 */

1429 if (vp->v_type != VCHR) {

1430 error = fdrevoke(vp, DTYPE_VNODE, cred);

1431 return (error);

1432 }

1433 if ((dev = vp->v_rdev) == NULL) {

1434 return(0);

1435 }

1436 reference_dev(dev);

1437 lwkt_gettoken(&spechash_token);

1438

1439 restart:

1440 vqn = SLIST_FIRST(&dev->si_hlist);

1441 if (vqn)

1442 vhold(vqn);

1443 while ((vq = vqn) != NULL) {

1444 if (VREFCNT(vq) > 0) {

1445 vref(vq);

1446 fdrevoke(vq, DTYPE_VNODE, cred);

1447 /*v_release_rdev(vq);*/

1448 vrele(vq);

1449 if (vq->v_rdev != dev) {

1450 vdrop(vq);

1451 goto restart;

1452 }

1453 }

1454 vqn = SLIST_NEXT(vq, v_cdevnext);

1455 if (vqn)

1456 vhold(vqn);

1457 vdrop(vq);

1458 }

1459 lwkt_reltoken(&spechash_token);

1460 dev_drevoke(dev);

1461 release_dev(dev);

1462 return (0);

1463 }

1464

1465 /*

1466 * This is called when the object underlying a vnode is being destroyed,

1467 * such as in a remove(). Try to recycle the vnode immediately if the

1468 * only active reference is our reference.

1469 *

1470 * Directory vnodes in the namecache with children cannot be immediately

1471 * recycled because numerous VOP_N*() ops require them to be stable.

1472 *

1473 * To avoid recursive recycling from VOP_INACTIVE implemenetations this

1474 * function is a NOP if VRECLAIMED is already set.

1475 */

1476 int

1477 vrecycle(struct vnode *vp)

1478 {

1479 if (VREFCNT(vp) <= 1 && (vp->v_flag & VRECLAIMED) == 0) {

1480 if (cache_inval_vp_nonblock(vp))

1481 return(0);

1482 vgone_vxlocked(vp);

1483 return (1);

1484 }

1485 return (0);

1486 }

1487

1488 /*

1489 * Return the maximum I/O size allowed for strategy calls on VP.

1490 *

1491 * If vp is VCHR or VBLK we dive the device, otherwise we use

1492 * the vp's mount info.

1493 *

1494 * The returned value is clamped at MAXPHYS as most callers cannot use

1495 * buffers larger than that size.

1496 */

1497 int

1498 vmaxiosize(struct vnode *vp)

1499 {

1500 int maxiosize;

1501

1502 if (vp->v_type == VBLK || vp->v_type == VCHR)

1503 maxiosize = vp->v_rdev->si_iosize_max;

1504 else

1505 maxiosize = vp->v_mount->mnt_iosize_max;

1506

1507 if (maxiosize > MAXPHYS)

1508 maxiosize = MAXPHYS;

1509 return (maxiosize);

1510 }

1511

1512 /*

1513 * Eliminate all activity associated with a vnode in preparation for

1514 * destruction.

1515 *

1516 * The vnode must be VX locked and refd and will remain VX locked and refd

1517 * on return. This routine may be called with the vnode in any state, as

1518 * long as it is VX locked. The vnode will be cleaned out and marked

1519 * VRECLAIMED but will not actually be reused until all existing refs and

1520 * holds go away.

1521 *

1522 * NOTE: This routine may be called on a vnode which has not yet been

1523 * already been deactivated (VOP_INACTIVE), or on a vnode which has

1524 * already been reclaimed.

1525 *

1526 * This routine is not responsible for placing us back on the freelist.

1527 * Instead, it happens automatically when the caller releases the VX lock

1528 * (assuming there aren't any other references).

1529 */

1530 void

1531 vgone_vxlocked(struct vnode *vp)

1532 {

1533 /*

1534 * assert that the VX lock is held. This is an absolute requirement

1535 * now for vgone_vxlocked() to be called.

1536 */

1537 KKASSERT(lockinuse(&vp->v_lock));

1538

1539 /*

1540 * Clean out the filesystem specific data and set the VRECLAIMED

1541 * bit. Also deactivate the vnode if necessary.

1542 *

1543 * The vnode should have automatically been removed from the syncer

1544 * list as syncer/dirty flags cleared during the cleaning.

1545 */

1546 vclean_vxlocked(vp, DOCLOSE);

1547

1548 /*

1549 * Normally panic if the vnode is still dirty, unless we are doing

1550 * a forced unmount (tmpfs typically).

1551 */

1552 if (vp->v_flag & VONWORKLST) {

1553 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {

1554 /* force removal */

1555 vn_syncer_remove(vp, 1);

1556 } else {

1557 panic("vp %p still dirty in vgone after flush", vp);

1558 }

1559 }

1560

1561 /*

1562 * Delete from old mount point vnode list, if on one.

1563 */

1564 if (vp->v_mount != NULL) {

1565 KKASSERT(vp->v_data == NULL);

1566 insmntque(vp, NULL);

1567 }

1568

1569 /*

1570 * If special device, remove it from special device alias list

1571 * if it is on one. This should normally only occur if a vnode is

1572 * being revoked as the device should otherwise have been released

1573 * naturally.

1574 */

1575 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {

1576 v_release_rdev(vp);

1577 }

1578

1579 /*

1580 * Set us to VBAD

1581 */

1582 vp->v_type = VBAD;

1583 }

1584

1585 /*

1586 * Calculate the total number of references to a special device. This

1587 * routine may only be called for VBLK and VCHR vnodes since v_rdev is

1588 * an overloaded field. Since dev_from_devid() can now return NULL, we

1589 * have to check for a NULL v_rdev.

1590 */

1591 int

1592 count_dev(cdev_t dev)

1593 {

1594 struct vnode *vp;

1595 int count = 0;

1596

1597 if (SLIST_FIRST(&dev->si_hlist)) {

1598 lwkt_gettoken(&spechash_token);

1599 SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) {

1600 count += vp->v_opencount;

1601 }

1602 lwkt_reltoken(&spechash_token);

1603 }

1604 return(count);

1605 }

1606

1607 int

1608 vcount(struct vnode *vp)

1609 {

1610 if (vp->v_rdev == NULL)

1611 return(0);

1612 return(count_dev(vp->v_rdev));

1613 }

1614

1615 /*

1616 * Initialize VMIO for a vnode. This routine MUST be called before a

1617 * VFS can issue buffer cache ops on a vnode. It is typically called

1618 * when a vnode is initialized from its inode.

1619 */

1620 int

1621 vinitvmio(struct vnode *vp, off_t filesize, int blksize, int boff)

1622 {

1623 vm_object_t object;

1624 int error = 0;

1625

1626 object = vp->v_object;

1627 if (object) {

1628 vm_object_hold(object);

1629 KKASSERT(vp->v_object == object);

1630 }

1631

1632 if (object == NULL) {

1633 object = vnode_pager_alloc(vp, filesize, 0, 0, blksize, boff);

1634

1635 /*

1636 * Dereference the reference we just created. This assumes

1637 * that the object is associated with the vp. Allow it to

1638 * have zero refs. It cannot be destroyed as long as it

1639 * is associated with the vnode.

1640 */

1641 vm_object_hold(object);

1642 atomic_add_int(&object->ref_count, -1);

1643 vrele(vp);

1644 } else {

1645 KKASSERT((object->flags & OBJ_DEAD) == 0);

1646 }

1647 KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object"));

1648 vsetflags(vp, VOBJBUF);

1649 vm_object_drop(object);

1650

1651 return (error);

1652 }

1653

1654

1655 /*

1656 * Print out a description of a vnode.

1657 */

1658 static char *typename[] =

1659 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

1660

1661 void

1662 vprint(char *label, struct vnode *vp)

1663 {

1664 char buf[96];

1665

1666 if (label != NULL)

1667 kprintf("%s: %p: ", label, (void *)vp);

1668 else

1669 kprintf("%p: ", (void *)vp);

1670 kprintf("type %s, refcnt %08x, writecount %d, holdcnt %d,",

1671 typename[vp->v_type],

1672 vp->v_refcnt, vp->v_writecount, vp->v_auxrefs);

1673 buf[0] = '\0';

1674 if (vp->v_flag & VROOT)

1675 strcat(buf, "|VROOT");

1676 if (vp->v_flag & VPFSROOT)

1677 strcat(buf, "|VPFSROOT");

1678 if (vp->v_flag & VTEXT)

1679 strcat(buf, "|VTEXT");

1680 if (vp->v_flag & VSYSTEM)

1681 strcat(buf, "|VSYSTEM");

1682 if (vp->v_flag & VOBJBUF)

1683 strcat(buf, "|VOBJBUF");

1684 if (buf[0] != '\0')

1685 kprintf(" flags (%s)", &buf[1]);

1686 if (vp->v_data == NULL) {

1687 kprintf("\n");

1688 } else {

1689 kprintf("\n\t");

1690 VOP_PRINT(vp);

1691 }

1692 }

1693

1694 /*

1695 * Do the usual access checking.

1696 * file_mode, uid and gid are from the vnode in question,

1697 * while acc_mode and cred are from the VOP_ACCESS parameter list

1698 */

1699 int

1700 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,

1701 mode_t acc_mode, struct ucred *cred)

1702 {

1703 mode_t mask;

1704 int ismember;

1705

1706 /*

1707 * Super-user always gets read/write access, but execute access depends

1708 * on at least one execute bit being set.

1709 */

1710 if (caps_priv_check(cred, SYSCAP_RESTRICTEDROOT) == 0) {

1711 if ((acc_mode & VEXEC) && type != VDIR &&

1712 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)

1713 return (EACCES);

1714 return (0);

1715 }

1716

1717 mask = 0;

1718

1719 /* Otherwise, check the owner. */

1720 if (cred->cr_uid == uid) {

1721 if (acc_mode & VEXEC)

1722 mask |= S_IXUSR;

1723 if (acc_mode & VREAD)

1724 mask |= S_IRUSR;

1725 if (acc_mode & VWRITE)

1726 mask |= S_IWUSR;

1727 return ((file_mode & mask) == mask ? 0 : EACCES);

1728 }

1729

1730 /* Otherwise, check the groups. */

1731 ismember = groupmember(gid, cred);

1732 if (cred->cr_svgid == gid || ismember) {

1733 if (acc_mode & VEXEC)

1734 mask |= S_IXGRP;

1735 if (acc_mode & VREAD)

1736 mask |= S_IRGRP;

1737 if (acc_mode & VWRITE)

1738 mask |= S_IWGRP;

1739 return ((file_mode & mask) == mask ? 0 : EACCES);

1740 }

1741

1742 /* Otherwise, check everyone else. */

1743 if (acc_mode & VEXEC)

1744 mask |= S_IXOTH;

1745 if (acc_mode & VREAD)

1746 mask |= S_IROTH;

1747 if (acc_mode & VWRITE)

1748 mask |= S_IWOTH;

1749 return ((file_mode & mask) == mask ? 0 : EACCES);

1750 }

1751

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * DDB "show lockedvnodes": list all of the locked vnodes in the system
 * by scanning every mount.  Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	kprintf("Locked vnodes\n");
	mountlist_scan(db_show_locked_vnodes, NULL,
		       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

/*
 * mountlist_scan() callback: vprint() every vnode on the mount's vnode
 * list for which vn_islocked() is true.  Always returns 0.
 */
static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
	struct vnode *vp;

	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL;
	     vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
		if (!vn_islocked(vp))
			continue;
		vprint(NULL, vp);
	}
	return (0);
}
#endif

1780

1781 /*

1782 * Top level filesystem related information gathering.

1783 */

1784 static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

1785

1786 static int

1787 vfs_sysctl(SYSCTL_HANDLER_ARGS)

1788 {

1789 int *name = (int *)arg1 - 1; /* XXX */

1790 u_int namelen = arg2 + 1; /* XXX */

1791 struct vfsconf *vfsp;

1792 int maxtypenum;

1793

1794 #if 1 || defined(COMPAT_PRELITE2)

1795 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */

1796 if (namelen == 1)

1797 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));

1798 #endif

1799

1800 #ifdef notyet

1801 /* all sysctl names at this level are at least name and field */

1802 if (namelen < 2)

1803 return (ENOTDIR); /* overloaded */

1804 if (name[0] != VFS_GENERIC) {

1805 vfsp = vfsconf_find_by_typenum(name[0]);

1806 if (vfsp == NULL)

1807 return (EOPNOTSUPP);

1808 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,

1809 oldp, oldlenp, newp, newlen, p));

1810 }

1811 #endif

1812 switch (name[1]) {

1813 case VFS_MAXTYPENUM:

1814 if (namelen != 2)

1815 return (ENOTDIR);

1816 maxtypenum = vfsconf_get_maxtypenum();

1817 return (SYSCTL_OUT(req, &maxtypenum, sizeof(maxtypenum)));

1818 case VFS_CONF:

1819 if (namelen != 3)

1820 return (ENOTDIR); /* overloaded */

1821 vfsp = vfsconf_find_by_typenum(name[2]);

1822 if (vfsp == NULL)

1823 return (EOPNOTSUPP);

1824 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));

1825 }

1826 return (EOPNOTSUPP);

1827 }

1828

1829 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,

1830 "Generic filesystem");

1831

1832 #if 1 || defined(COMPAT_PRELITE2)

1833

1834 static int

1835 sysctl_ovfs_conf_iter(struct vfsconf *vfsp, void *data)

1836 {

1837 int error;

1838 struct ovfsconf ovfs;

1839 struct sysctl_req *req = (struct sysctl_req*) data;

1840

1841 bzero(&ovfs, sizeof(ovfs));

1842 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */

1843 strcpy(ovfs.vfc_name, vfsp->vfc_name);

1844 ovfs.vfc_index = vfsp->vfc_typenum;

1845 ovfs.vfc_refcount = vfsp->vfc_refcount;

1846 ovfs.vfc_flags = vfsp->vfc_flags;

1847 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);

1848 if (error)

1849 return error; /* abort iteration with error code */

1850 else

1851 return 0; /* continue iterating with next element */

1852 }

1853

1854 static int

1855 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)

1856 {

1857 return vfsconf_each(sysctl_ovfs_conf_iter, (void*)req);

1858 }

1859

1860 #endif /* 1 || COMPAT_PRELITE2 */

1861

1862 /*

1863 * Check to see if a filesystem is mounted on a block device.

1864 */

1865 int

1866 vfs_mountedon(struct vnode *vp)

1867 {

1868 cdev_t dev;

1869

1870 dev = vp->v_rdev;

1871 if (dev != NULL && dev->si_mountpoint)

1872 return (EBUSY);

1873 return (0);

1874 }

1875

1876 /*

1877 * Unmount all filesystems. The list is traversed in reverse order

1878 * of mounting to avoid dependencies.

1879 *

1880 * We want the umountall to be able to break out of its loop if a

1881 * failure occurs, after scanning all possible mounts, so the callback

1882 * returns 0 on error.

1883 *

1884 * NOTE: Do not call mountlist_remove(mp) on error any more, this will

1885 * confuse mountlist_scan()'s unbusy check.

1886 */

1887 static int vfs_umountall_callback(struct mount *mp, void *data);

1888

1889 void

1890 vfs_unmountall(int halting)

1891 {

1892 int count;

1893

1894 do {

1895 count = mountlist_scan(vfs_umountall_callback, &halting,

1896 MNTSCAN_REVERSE|MNTSCAN_NOBUSY);

1897 } while (count);

1898 }

1899

1900 static

1901 int

1902 vfs_umountall_callback(struct mount *mp, void *data)

1903 {

1904 int error;

1905 int halting = *(int *)data;

1906

1907 /*

1908 * NOTE: When halting, dounmount will disconnect but leave

1909 * certain mount points intact. e.g. devfs.

1910 */

1911 error = dounmount(mp, MNT_FORCE, halting);

1912 if (error) {

1913 kprintf("unmount of filesystem mounted from %s failed (",

1914 mp->mnt_stat.f_mntfromname);

1915 if (error == EBUSY)

1916 kprintf("BUSY)\n");

1917 else

1918 kprintf("%d)\n", error);

1919 return 0;

1920 } else {

1921 return 1;

1922 }

1923 }

1924

1925 /*

1926 * Checks the mount flags for parameter mp and put the names comma-separated

1927 * into a string buffer buf with a size limit specified by len.

1928 *

1929 * It returns the number of bytes written into buf, and (*errorp) will be

1930 * set to 0, EINVAL (if passed length is 0), or ENOSPC (supplied buffer was

1931 * not large enough). The buffer will be 0-terminated if len was not 0.

1932 */

1933 size_t

1934 vfs_flagstostr(int flags, const struct mountctl_opt *optp,

1935 char *buf, size_t len, int *errorp)

1936 {

1937 static const struct mountctl_opt optnames[] = {

1938 { MNT_RDONLY, "read-only" },

1939 { MNT_SYNCHRONOUS, "synchronous" },

1940 { MNT_NOEXEC, "noexec" },

1941 { MNT_NOSUID, "nosuid" },

1942 { MNT_NODEV, "nodev" },

1943 { MNT_AUTOMOUNTED, "automounted" },

1944 { MNT_ASYNC, "asynchronous" },

1945 { MNT_SUIDDIR, "suiddir" },

1946 { MNT_SOFTDEP, "soft-updates" },

1947 { MNT_NOSYMFOLLOW, "nosymfollow" },

1948 { MNT_TRIM, "trim" },

1949 { MNT_NOATIME, "noatime" },

1950 { MNT_NOCLUSTERR, "noclusterr" },

1951 { MNT_NOCLUSTERW, "noclusterw" },

1952 { MNT_EXRDONLY, "NFS read-only" },

1953 { MNT_EXPORTED, "NFS exported" },

1954 /* Remaining NFS flags could come here */

1955 { MNT_LOCAL, "local" },

1956 { MNT_QUOTA, "with-quotas" },

1957 /* { MNT_ROOTFS, "rootfs" }, */

1958 /* { MNT_IGNORE, "ignore" }, */

1959 { 0, NULL}

1960 };

1961 int bwritten;

1962 int bleft;

1963 int optlen;

1964 int actsize;

1965

1966 *errorp = 0;

1967 bwritten = 0;

1968 bleft = len - 1; /* leave room for trailing \0 */

1969

1970 /*

1971 * Checks the size of the string. If it contains

1972 * any data, then we will append the new flags to

1973 * it.

1974 */

1975 actsize = strlen(buf);

1976 if (actsize > 0)

1977 buf += actsize;

1978

1979 /* Default flags if no flags passed */

1980 if (optp == NULL)

1981 optp = optnames;

1982

1983 if (bleft < 0) { /* degenerate case, 0-length buffer */

1984 *errorp = EINVAL;

1985 return(0);

1986 }

1987

1988 for (; flags && optp->o_opt; ++optp) {

1989 if ((flags & optp->o_opt) == 0)

1990 continue;

1991 optlen = strlen(optp->o_name);

1992 if (bwritten || actsize > 0) {

1993 if (bleft < 2) {

1994 *errorp = ENOSPC;

1995 break;

1996 }

1997 buf[bwritten++] = ',';

1998 buf[bwritten++] = ' ';

1999 bleft -= 2;

2000 }

2001 if (bleft < optlen) {

2002 *errorp = ENOSPC;

2003 break;

2004 }

2005 bcopy(optp->o_name, buf + bwritten, optlen);

2006 bwritten += optlen;

2007 bleft -= optlen;

2008 flags &= ~optp->o_opt;

2009 }

2010

2011 /*

2012 * Space already reserved for trailing \0

2013 */

2014 buf[bwritten] = 0;

2015 return (bwritten);

2016 }

2017

2018 /*

2019 * Build hash lists of net addresses and hang them off the mount point.

2020 * Called by ufs_mount() to set up the lists of export addresses.

2021 */

2022 static int

2023 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,

2024 const struct export_args *argp)

2025 {

2026 struct netcred *np;

2027 struct radix_node_head *rnh;

2028 int i;

2029 struct radix_node *rn;

2030 struct sockaddr *saddr, *smask = NULL;

2031 int error;

2032

2033 if (argp->ex_addrlen == 0) {

2034 if (mp->mnt_flag & MNT_DEFEXPORTED)

2035 return (EPERM);

2036 np = &nep->ne_defexported;

2037 np->netc_exflags = argp->ex_flags;

2038 np->netc_anon = argp->ex_anon;

2039 np->netc_anon.cr_ref = 1;

2040 mp->mnt_flag |= MNT_DEFEXPORTED;

2041 return (0);

2042 }

2043

2044 if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)

2045 return (EINVAL);

2046 if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)

2047 return (EINVAL);

2048

2049 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;

2050 np = (struct netcred *)kmalloc(i, M_NETCRED, M_WAITOK | M_ZERO);

2051 saddr = (struct sockaddr *) (np + 1);

2052 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))

2053 goto out;

2054 if (saddr->sa_len > argp->ex_addrlen)

2055 saddr->sa_len = argp->ex_addrlen;

2056 if (argp->ex_masklen) {

2057 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);

2058 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);

2059 if (error)

2060 goto out;

2061 if (smask->sa_len > argp->ex_masklen)

2062 smask->sa_len = argp->ex_masklen;

2063 }

2064 NE_LOCK(nep);

2065 if (nep->ne_maskhead == NULL) {

2066 if (!rn_inithead(&nep->ne_maskhead, NULL, 0)) {

2067 error = ENOBUFS;

2068 goto out;

2069 }

2070 }

2071 if ((rnh = vfs_create_addrlist_af(saddr->sa_family, nep)) == NULL) {

2072 error = ENOBUFS;

2073 goto out;

2074 }

2075 rn = rnh->rnh_addaddr(saddr, smask, rnh, np->netc_rnodes);

2076 NE_UNLOCK(nep);

2077 if (rn == NULL || np != (struct netcred *)rn) { /* already exists */

2078 error = EPERM;

2079 goto out;

2080 }

2081 np->netc_exflags = argp->ex_flags;

2082 np->netc_anon = argp->ex_anon;

2083 np->netc_anon.cr_ref = 1;

2084 return (0);

2085

2086 out:

2087 kfree(np, M_NETCRED);

2088 return (error);

2089 }

2090

2091 /*

2092 * Free netcred structures installed in the netexport

2093 */

2094 static void

2095 vfs_free_netcred(struct radix_node *rn)

2096 {

2097 struct netcred *np;

2098

2099 np = (struct netcred *)rn;

2100 kfree(np, M_NETCRED);

2101 }

2102

/*
 * Return the radix head used for export addresses of family 'af',
 * creating it (and recording it in nep) on first use.
 *
 * The netexport lock must be held (asserted).  Only AF_INET and
 * AF_INET6 are handled, and only when the kernel is built with INET /
 * INET6 respectively; nep->ne_maskhead must already exist in those
 * configurations (asserted).
 *
 * Returns NULL for any other address family or when rn_inithead()
 * fails.
 */
static struct radix_node_head *
vfs_create_addrlist_af(int af, struct netexport *nep)
{
	struct radix_node_head *head = NULL;
#if defined(INET) || defined(INET6)
	struct radix_node_head *maskhead = nep->ne_maskhead;
#endif

	NE_ASSERT_LOCKED(nep);
#if defined(INET) || defined(INET6)
	KKASSERT(maskhead != NULL);
#endif
	switch (af) {
#ifdef INET
	case AF_INET:
		head = nep->ne_inethead;
		if (head != NULL)
			break;
		if (!rn_inithead(&head, maskhead,
				 offsetof(struct sockaddr_in, sin_addr)))
			return (NULL);
		nep->ne_inethead = head;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		head = nep->ne_inet6head;
		if (head != NULL)
			break;
		if (!rn_inithead(&head, maskhead,
				 offsetof(struct sockaddr_in6, sin6_addr)))
			return (NULL);
		nep->ne_inet6head = head;
		break;
#endif
	default:
		break;
	}
	return (head);
}

2140

2141 /*

2142 * Free the net address hash lists that are hanging off the mount points.

2143 */

2144 static void

2145 vfs_free_addrlist(struct netexport *nep)

2146 {

2147 NE_LOCK(nep);

2148 if (nep->ne_inethead != NULL) {

2149 rn_flush(nep->ne_inethead, vfs_free_netcred);

2150 rn_freehead(nep->ne_inethead);

2151 nep->ne_inethead = NULL;

2152 }

2153 if (nep->ne_inet6head != NULL) {

2154 rn_flush(nep->ne_inet6head, vfs_free_netcred);

2155 rn_freehead(nep->ne_inet6head);

2156 nep->ne_inet6head = NULL;

2157 }

2158 if (nep->ne_maskhead != NULL) {

2159 rn_flush(nep->ne_maskhead, rn_freemask);

2160 rn_freehead(nep->ne_maskhead);

2161 nep->ne_maskhead = NULL;

2162 }

2163 NE_UNLOCK(nep);

2164 }

2165

2166 int

2167 vfs_export(struct mount *mp, struct netexport *nep,

2168 const struct export_args *argp)

2169 {

2170 int error;

2171

2172 if (argp->ex_flags & MNT_DELEXPORT) {

2173 if (mp->mnt_flag & MNT_EXPUBLIC) {

2174 vfs_setpublicfs(NULL, NULL, NULL);

2175 mp->mnt_flag &= ~MNT_EXPUBLIC;

2176 }

2177 vfs_free_addrlist(nep);

2178 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);

2179 }

2180 if (argp->ex_flags & MNT_EXPORTED) {

2181 if (argp->ex_flags & MNT_EXPUBLIC) {

2182 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)

2183 return (error);

2184 mp->mnt_flag |= MNT_EXPUBLIC;

2185 }

2186 if ((error = vfs_hang_addrlist(mp, nep, argp)))

2187 return (error);

2188 mp->mnt_flag |= MNT_EXPORTED;

2189 }

2190 return (0);

2191 }

2192

2193

2194 /*

2195 * Set the publicly exported filesystem (WebNFS). Currently, only

2196 * one public filesystem is possible in the spec (RFC 2054 and 2055)

2197 */

2198 int

2199 vfs_setpublicfs(struct mount *mp, struct netexport *nep,

2200 const struct export_args *argp)

2201 {

2202 int error;

2203 struct vnode *rvp;

2204 char *cp;

2205

2206 /*

2207 * mp == NULL -> invalidate the current info, the FS is

2208 * no longer exported. May be called from either vfs_export

2209 * or unmount, so check if it hasn't already been done.

2210 */

2211 if (mp == NULL) {

2212 if (nfs_pub.np_valid) {

2213 nfs_pub.np_valid = 0;

2214 if (nfs_pub.np_index != NULL) {

2215 kfree(nfs_pub.np_index, M_TEMP);

2216 nfs_pub.np_index = NULL;

2217 }

2218 }

2219 return (0);

2220 }

2221

2222 /*

2223 * Only one allowed at a time.

2224 */

2225 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)

2226 return (EBUSY);

2227

2228 /*

2229 * Get real filehandle for root of exported FS.

2230 */

2231 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));

2232 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

2233

2234 if ((error = VFS_ROOT(mp, &rvp)))

2235 return (error);

2236

2237 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))

2238 return (error);

2239

2240 vput(rvp);

2241

2242 /*

2243 * If an indexfile was specified, pull it in.

2244 */

2245 if (argp->ex_indexfile != NULL) {

2246 int namelen;

2247

2248 error = vn_get_namelen(rvp, &namelen);

2249 if (error)

2250 return (error);

2251 nfs_pub.np_index = kmalloc(namelen, M_TEMP, M_WAITOK);

2252 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,

2253 namelen, NULL);

2254 if (!error) {

2255 /*

2256 * Check for illegal filenames.

2257 */

2258 for (cp = nfs_pub.np_index; *cp; cp++) {

2259 if (*cp == '/') {

2260 error = EINVAL;

2261 break;

2262 }

2263 }

2264 }

2265 if (error) {

2266 kfree(nfs_pub.np_index, M_TEMP);

2267 return (error);

2268 }

2269 }

2270

2271 nfs_pub.np_mount = mp;

2272 nfs_pub.np_valid = 1;

2273 return (0);

2274 }

2275

2276 struct netcred *

2277 vfs_export_lookup(struct mount *mp, struct netexport *nep,

2278 struct sockaddr *nam)

2279 {

2280 struct netcred *np;

2281 struct radix_node_head *rnh;

2282 struct sockaddr *saddr;

2283

2284 np = NULL;

2285 if (mp->mnt_flag & MNT_EXPORTED) {

2286 /*

2287 * Lookup in the export list first.

2288 */

2289 NE_LOCK(nep);

2290 if (nam != NULL) {

2291 saddr = nam;

2292 switch (saddr->sa_family) {

2293 #ifdef INET

2294 case AF_INET:

2295 rnh = nep->ne_inethead;

2296 break;

2297 #endif

2298 #ifdef INET6

2299 case AF_INET6:

2300 rnh = nep->ne_inet6head;

2301 break;

2302 #endif

2303 default:

2304 rnh = NULL;

2305 }

2306 if (rnh != NULL) {

2307 np = (struct netcred *)

2308 rnh->rnh_matchaddr(saddr, rnh);

2309 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)

2310 np = NULL;

2311 }

2312 }

2313 NE_UNLOCK(nep);

2314 /*

2315 * If no address match, use the default if it exists.

2316 */

2317 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)

2318 np = &nep->ne_defexported;

2319 }

2320 return (np);

2321 }

2322

2323 /*

2324 * perform msync on all vnodes under a mount point. The mount point must

2325 * be locked. This code is also responsible for lazy-freeing unreferenced

2326 * vnodes whos VM objects no longer contain pages.

2327 *

2328 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.

2329 *

2330 * NOTE: XXX VOP_PUTPAGES and friends requires that the vnode be locked,

2331 * but vnode_pager_putpages() doesn't lock the vnode. We have to do it

2332 * way up in this high level function.

2333 */

2334 static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);

2335 static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

2336

2337 void

2338 vfs_msync(struct mount *mp, int flags)

2339 {

2340 int vmsc_flags;

2341

2342 /*

2343 * tmpfs sets this flag to prevent msync(), sync, and the

2344 * filesystem periodic syncer from trying to flush VM pages

2345 * to swap. Only pure memory pressure flushes tmpfs VM pages

2346 * to swap.

2347 */

2348 if (mp->mnt_kern_flag & MNTK_NOMSYNC)

2349 return;

2350

2351 /*

2352 * Ok, scan the vnodes for work. If the filesystem is using the

2353 * syncer thread feature we can use vsyncscan() instead of

2354 * vmntvnodescan(), which is much faster.

2355 */

2356 vmsc_flags = VMSC_GETVP;

2357 if (flags != MNT_WAIT)

2358 vmsc_flags |= VMSC_NOWAIT;

2359

2360 if (mp->mnt_kern_flag & MNTK_THR_SYNC) {

2361 vsyncscan(mp, vmsc_flags, vfs_msync_scan2,

2362 (void *)(intptr_t)flags);

2363 } else {

2364 vmntvnodescan(mp, vmsc_flags,

2365 vfs_msync_scan1, vfs_msync_scan2,

2366 (void *)(intptr_t)flags);

2367 }

2368 }

2369

2370 /*

2371 * scan1 is a fast pre-check. There could be hundreds of thousands of

2372 * vnodes, we cannot afford to do anything heavy weight until we have a

2373 * fairly good indication that there is work to do.

2374 *

2375 * The new namecache holds the vnode for each v_namecache association

2376 * so allow these refs.

2377 */

2378 static

2379 int

2380 vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)

2381 {

2382 int flags = (int)(intptr_t)data;

2383

2384 if ((vp->v_flag & VRECLAIMED) == 0) {

2385 if (vp->v_auxrefs == vp->v_namecache_count &&

2386 VREFCNT(vp) <= 0 && vp->v_object) {

2387 return(0); /* call scan2 */

2388 }

2389 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&

2390 (vp->v_flag & VOBJDIRTY) &&

2391 (flags == MNT_WAIT || vn_islocked(vp) == 0)) {

2392 return(0); /* call scan2 */

2393 }

2394 }

2395

2396 /*

2397 * do not call scan2, continue the loop

2398 */

2399 return(-1);

2400 }

2401

2402 /*

2403 * This callback is handed a locked vnode.

2404 */

2405 static

2406 int

2407 vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)

2408 {

2409 vm_object_t obj;

2410 int flags = (int)(intptr_t)data;

2411 int opcflags;

2412

2413 if (vp->v_flag & VRECLAIMED)

2414 return(0);

2415

2416 if ((mp->mnt_flag & MNT_RDONLY) == 0 && (vp->v_flag & VOBJDIRTY)) {

2417 if ((obj = vp->v_object) != NULL) {

2418 if (flags == MNT_WAIT) {

2419 /*

2420 * VFS_MSYNC is called with MNT_WAIT when

2421 * unmounting.

2422 */

2423 opcflags = OBJPC_SYNC;

2424 } else if (vp->v_writecount || obj->ref_count) {

2425 /*

2426 * VFS_MSYNC is otherwise called via the

2427 * periodic filesystem sync or the 'sync'

2428 * command. Honor MADV_NOSYNC / MAP_NOSYNC

2429 * if the file is open for writing or memory

2430 * mapped. Pages flagged PG_NOSYNC will not

2431 * be automatically flushed at this time.

2432 *

2433 * The obj->ref_count test is not perfect

2434 * since temporary refs may be present, but

2435 * the periodic filesystem sync will ultimately

2436 * catch it if the file is not open and not

2437 * mapped.

2438 */

2439 opcflags = OBJPC_NOSYNC;

2440 } else {

2441 /*

2442 * If the file is no longer open for writing

2443 * and also no longer mapped, do not honor

2444 * MAP_NOSYNC. That is, fully synchronize

2445 * the file.

2446 *

2447 * This still occurs on the periodic fs sync,

2448 * so frontend programs which turn the file

2449 * over quickly enough can still avoid the

2450 * sync, but ultimately we do want to flush

2451 * even MADV_NOSYNC pages once it is no longer

2452 * mapped or open for writing.

2453 */

2454 opcflags = 0;

2455 }

2456 vm_object_page_clean(obj, 0, 0, opcflags);

2457 }

2458 }

2459 return(0);

2460 }

2461

2462 /*

2463 * Wake up anyone interested in vp because it is being revoked.

2464 */

2465 void

2466 vn_gone(struct vnode *vp)

2467 {

2468 lwkt_gettoken(&vp->v_token);

2469 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, NOTE_REVOKE);

2470 lwkt_reltoken(&vp->v_token);

2471 }

2472

2473 /*

2474 * extract the cdev_t from a VBLK or VCHR. The vnode must have been opened

2475 * (or v_rdev might be NULL).

2476 */

2477 cdev_t

2478 vn_todev(struct vnode *vp)

2479 {

2480 if (vp->v_type != VBLK && vp->v_type != VCHR)

2481 return (NULL);

2482 KKASSERT(vp->v_rdev != NULL);

2483 return (vp->v_rdev);

2484 }

2485

2486 /*

2487 * Check if vnode represents a disk device. The vnode does not need to be

2488 * opened.

2489 *

2490 * MPALMOSTSAFE

2491 */

2492 int

2493 vn_isdisk(struct vnode *vp, int *errp)

2494 {

2495 cdev_t dev;

2496

2497 if (vp->v_type != VCHR) {

2498 if (errp != NULL)

2499 *errp = ENOTBLK;

2500 return (0);

2501 }

2502

2503 dev = vp->v_rdev;

2504

2505 if (dev == NULL) {

2506 if (errp != NULL)

2507 *errp = ENXIO;

2508 return (0);

2509 }

2510 if (dev_is_good(dev) == 0) {

2511 if (errp != NULL)

2512 *errp = ENXIO;

2513 return (0);

2514 }

2515 if ((dev_dflags(dev) & D_DISK) == 0) {

2516 if (errp != NULL)

2517 *errp = ENOTBLK;

2518 return (0);

2519 }

2520 if (errp != NULL)

2521 *errp = 0;

2522 return (1);

2523 }

2524

2525 int

2526 vn_get_namelen(struct vnode *vp, int *namelen)

2527 {

2528 int error;

2529 register_t retval[2];

2530

2531 error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);

2532 if (error)

2533 return (error);

2534 *namelen = (int)retval[0];

2535 return (0);

2536 }

2537

2538 int

2539 vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,

2540 uint16_t d_namlen, const char *d_name)

2541 {

2542 struct dirent *dp;

2543 size_t len;

2544

2545 len = _DIRENT_RECLEN(d_namlen);

2546 if (len > uio->uio_resid)

2547 return(1);

2548

2549 dp = kmalloc(len, M_TEMP, M_WAITOK | M_ZERO);

2550

2551 dp->d_ino = d_ino;

2552 dp->d_namlen = d_namlen;

2553 dp->d_type = d_type;

2554 bcopy(d_name, dp->d_name, d_namlen);

2555

2556 *error = uiomove((caddr_t)dp, len, uio);

2557

2558 kfree(dp, M_TEMP);

2559

2560 return(0);

2561 }

2562

2563 void

2564 vn_mark_atime(struct vnode *vp, struct thread *td)

2565 {

2566 struct proc *p = td->td_proc;

2567 struct ucred *cred = p ? p->p_ucred : proc0.p_ucred;

2568

2569 if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {

2570 VOP_MARKATIME(vp, cred);

2571 }

2572 }

2573

2574 /*

2575 * Calculate the number of entries in an inode-related chained hash table.

2576 * With today's memory sizes, maxvnodes can wind up being a very large

2577 * number. There is no reason to waste memory, so tolerate some stacking.

2578 */

2579 int

2580 vfs_inodehashsize(void)

2581 {

2582 int hsize;

2583

2584 hsize = 32;

2585 while (hsize < maxvnodes)

2586 hsize <<= 1;

2587 while (hsize > maxvnodes * 2)

2588 hsize >>= 1; /* nominal 2x stacking */

2589

2590 if (maxvnodes > 1024 * 1024)

2591 hsize >>= 1; /* nominal 8x stacking */

2592

2593 if (maxvnodes > 128 * 1024)

2594 hsize >>= 1; /* nominal 4x stacking */

2595

2596 if (hsize < 16)

2597 hsize = 16;

2598

2599 return hsize;

2600 }

2601

2602 union _qcvt {

2603 quad_t qcvt;

2604 int32_t val[2];

2605 };

2606

2607 #define SETHIGH(q, h) { \

2608 union _qcvt tmp; \

2609 tmp.qcvt = (q); \

2610 tmp.val[_QUAD_HIGHWORD] = (h); \

2611 (q) = tmp.qcvt; \

2612 }

2613 #define SETLOW(q, l) { \

2614 union _qcvt tmp; \

2615 tmp.qcvt = (q); \

2616 tmp.val[_QUAD_LOWWORD] = (l); \

2617 (q) = tmp.qcvt; \

2618 }

2619

2620 u_quad_t

2621 init_va_filerev(void)

2622 {

2623 struct timeval tv;

2624 u_quad_t ret = 0;

2625

2626 getmicrouptime(&tv);

2627 SETHIGH(ret, tv.tv_sec);

2628 SETLOW(ret, tv.tv_usec * 4294);

2629

2630 return ret;

2631 }

2632

2633 /*

2634 * Set default timestamp_precision. If hz is reasonably high we go for

2635 * performance and limit vfs timestamps to microseconds with tick resolution.

2636 * If hz is too low, however, we lose a bit of performance to get a more

2637 * precise timestamp, because the mtime/ctime granularity might just be too

2638 * rough otherwise (for make and Makefile's, for example).

2639 */

2640 static void

2641 vfs_ts_prec_init(void *dummy)

2642 {

2643 if (timestamp_precision < 0) {

2644 if (hz >= 100)

2645 timestamp_precision = TSP_USEC;

2646 else

2647 timestamp_precision = TSP_USEC_PRECISE;

2648 }

2649 }

2650 SYSINIT(vfs_ts_prec_init, SI_SUB_VFS, SI_ORDER_ANY, vfs_ts_prec_init, NULL);