FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_subr.c
1 /* $NetBSD: vfs_subr.c,v 1.243.2.7 2007/08/26 18:44:17 bouyer Exp $ */
2
3 /*-
4 * Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 * This code is derived from software contributed to The NetBSD Foundation
11 * by Charles M. Hannum.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. All advertising materials mentioning features or use of this software
22 * must display the following acknowledgement:
23 * This product includes software developed by the NetBSD
24 * Foundation, Inc. and its contributors.
25 * 4. Neither the name of The NetBSD Foundation nor the names of its
26 * contributors may be used to endorse or promote products derived
27 * from this software without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
30 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
31 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
32 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
33 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 * POSSIBILITY OF SUCH DAMAGE.
40 */
41
42 /*
43 * Copyright (c) 1989, 1993
44 * The Regents of the University of California. All rights reserved.
45 * (c) UNIX System Laboratories, Inc.
46 * All or some portions of this file are derived from material licensed
47 * to the University of California by American Telephone and Telegraph
48 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
49 * the permission of UNIX System Laboratories, Inc.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions
53 * are met:
54 * 1. Redistributions of source code must retain the above copyright
55 * notice, this list of conditions and the following disclaimer.
56 * 2. Redistributions in binary form must reproduce the above copyright
57 * notice, this list of conditions and the following disclaimer in the
58 * documentation and/or other materials provided with the distribution.
59 * 3. Neither the name of the University nor the names of its contributors
60 * may be used to endorse or promote products derived from this software
61 * without specific prior written permission.
62 *
63 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
64 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
65 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
67 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
69 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
70 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
72 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
73 * SUCH DAMAGE.
74 *
75 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
76 */
77
78 /*
79 * External virtual filesystem routines
80 */
81
82 #include <sys/cdefs.h>
83 __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.243.2.7 2007/08/26 18:44:17 bouyer Exp $");
84
85 #include "opt_inet.h"
86 #include "opt_ddb.h"
87 #include "opt_compat_netbsd.h"
88 #include "opt_compat_43.h"
89
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/proc.h>
93 #include <sys/kernel.h>
94 #include <sys/mount.h>
95 #include <sys/time.h>
96 #include <sys/event.h>
97 #include <sys/fcntl.h>
98 #include <sys/vnode.h>
99 #include <sys/stat.h>
100 #include <sys/namei.h>
101 #include <sys/ucred.h>
102 #include <sys/buf.h>
103 #include <sys/errno.h>
104 #include <sys/malloc.h>
105 #include <sys/domain.h>
106 #include <sys/mbuf.h>
107 #include <sys/sa.h>
108 #include <sys/syscallargs.h>
109 #include <sys/device.h>
110 #include <sys/extattr.h>
111 #include <sys/dirent.h>
112 #include <sys/filedesc.h>
113
114 #include <miscfs/specfs/specdev.h>
115 #include <miscfs/genfs/genfs.h>
116 #include <miscfs/syncfs/syncfs.h>
117
118 #include <netinet/in.h>
119
120 #include <uvm/uvm.h>
121 #include <uvm/uvm_ddb.h>
122
123 #include <netinet/in.h>
124
125 #include <sys/sysctl.h>
126
/*
 * Map file-type bits from an inode mode to a vnode type (enum vtype).
 * 16 entries; presumably indexed by (mode & S_IFMT) >> 12 as is
 * conventional for this table -- unused type codes map to VNON/VBAD.
 */
const enum vtype iftovt_tab[16] = {
    VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
    VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
/*
 * Inverse of iftovt_tab: map an enum vtype (used as index) back to the
 * corresponding S_IF* file-type bits for a mode word.
 */
const int vttoif_tab[9] = {
    0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
    S_IFSOCK, S_IFIFO, S_IFMT,
};
135
int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

extern int dovfsusermount;	/* 1 => permit any user to mount filesystems */

/*
 * Insq/Remq for the vnode usage lists.
 * bufremvn() marks the buffer as off-list by poisoning le_next with
 * NOLIST; reassignbuf()/brelvp() test for that value before removing.
 */
#define bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) {						\
	LIST_REMOVE(bp, b_vnbufs);				\
	(bp)->b_vnbufs.le_next = NOLIST;			\
}
/* TAILQ_HEAD(freelst, vnode) vnode_free_list =	vnode free list (in vnode.h) */
/*
 * Two LRU lists of reclaimable vnodes: vnode_free_list holds vnodes
 * with no associated buffers; vnode_hold_list holds vnodes that still
 * have referencing buffers (holdcnt > 0).  Both protected by
 * vnode_free_list_slock.
 */
struct freelst vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
struct freelst vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);

struct mntlist mountlist =			/* mounted filesystem list */
    CIRCLEQ_HEAD_INITIALIZER(mountlist);
struct vfs_list_head vfs_list =			/* vfs list */
    LIST_HEAD_INITIALIZER(vfs_list);

struct nfs_public nfs_pub;			/* publicly exported FS */

/*
 * Lock hierarchy (as used in this file):
 *   mountlist_slock	- guards mountlist traversal
 *   mntid_slock	- guards fsid generation state
 *   mntvnode_slock	- guards per-mount vnode lists (v_mntvnodes)
 *   vnode_free_list_slock - guards the two free lists and numvnodes
 *   spechash_slock	- guards the special-device alias hash chains
 */
struct simplelock mountlist_slock = SIMPLELOCK_INITIALIZER;
static struct simplelock mntid_slock = SIMPLELOCK_INITIALIZER;
struct simplelock mntvnode_slock = SIMPLELOCK_INITIALIZER;
struct simplelock vnode_free_list_slock = SIMPLELOCK_INITIALIZER;
struct simplelock spechash_slock = SIMPLELOCK_INITIALIZER;

/* XXX - gross; single global lock to protect v_numoutput */
struct simplelock global_v_numoutput_slock = SIMPLELOCK_INITIALIZER;

/*
 * These define the root filesystem and device.
 */
struct mount *rootfs;
struct vnode *rootvnode;
struct device *root_device;			/* root device */

/* Backing pool from which all struct vnodes are allocated/freed. */
POOL_INIT(vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodepl",
    &pool_allocator_nointr);

MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");

/*
 * Local declarations.
 */
void insmntque(struct vnode *, struct mount *);
int getdevvp(dev_t, struct vnode **, enum vtype);

void vclean(struct vnode *, int, struct proc *);

static int vfs_hang_addrlist(struct mount *, struct netexport *,
    struct export_args *);
static int vfs_free_netcred(struct radix_node *, void *);
static void vfs_free_addrlist(struct netexport *);
static struct vnode *getcleanvnode(struct proc *);

#ifdef DEBUG
void printlockedvnodes(void);
#endif
198
/*
 * Initialize the vnode management data structures.
 *
 * The free lists, locks and the vnode pool are all statically
 * initialized above, so the only remaining runtime setup is the
 * filesystem syncer daemon state.
 */
void
vntblinit(void)
{

	vn_initialize_syncerd();
}
211
/*
 * Shrink the vnode pool until at most "target" vnodes remain, by
 * repeatedly reclaiming a clean vnode and returning it to vnode_pool.
 *
 * Returns 0 on success, or EBUSY if no reclaimable vnode could be
 * found before reaching the target.
 *
 * Locking: getcleanvnode() is entered with vnode_free_list_slock held
 * and always returns with it released (both on success and on NULL),
 * hence the re-lock after pool_put() and the lock-free EBUSY return.
 */
int
vfs_drainvnodes(long target, struct proc *p)
{

	simple_lock(&vnode_free_list_slock);
	while (numvnodes > target) {
		struct vnode *vp;

		vp = getcleanvnode(p);
		if (vp == NULL)
			return EBUSY; /* give up */
		pool_put(&vnode_pool, vp);
		simple_lock(&vnode_free_list_slock);
		numvnodes--;
	}
	simple_unlock(&vnode_free_list_slock);

	return 0;
}
231
/*
 * grab a vnode from freelist and clean it.
 *
 * Must be called with vnode_free_list_slock held; returns with it
 * RELEASED in all cases.  Scans vnode_free_list first and falls back
 * to vnode_hold_list.  Returns the reclaimed (cleaned) vnode, or
 * NULLVP if neither list yields a usable candidate.
 */
struct vnode *
getcleanvnode(p)
	struct proc *p;
{
	struct vnode *vp;
	struct mount *mp;
	struct freelst *listhd;

	LOCK_ASSERT(simple_lock_held(&vnode_free_list_slock));

	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/* Skip vnodes whose interlock someone else holds. */
		if (!simple_lock_try(&vp->v_interlock))
			continue;
		/*
		 * as our lwp might hold the underlying vnode locked,
		 * don't try to reclaim the VLAYER vnode if it's locked.
		 */
		if ((vp->v_flag & VXLOCK) == 0 &&
		    ((vp->v_flag & VLAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
			/* Also require write access to the filesystem. */
			if (vn_start_write(vp, &mp, V_NOWAIT) == 0)
				break;
		}
		mp = NULL;
		simple_unlock(&vp->v_interlock);
	}

	if (vp == NULLVP) {
		/* First list exhausted: retry on the hold list once. */
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		simple_unlock(&vnode_free_list_slock);
		return NULLVP;
	}

	/* Loop exits via break with vp's interlock held and mp set. */
	if (vp->v_usecount)
		panic("free vnode isn't, vp %p", vp);
	TAILQ_REMOVE(listhd, vp, v_freelist);
	/* see comment on why 0xdeadb is set at end of vgone (below) */
	vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
	simple_unlock(&vnode_free_list_slock);
	vp->v_lease = NULL;

	/* vgonel() consumes the interlock; unlock ourselves for VBAD. */
	if (vp->v_type != VBAD)
		vgonel(vp, p);
	else
		simple_unlock(&vp->v_interlock);
	vn_finished_write(mp, 0);
#ifdef DIAGNOSTIC
	if (vp->v_data || vp->v_uobj.uo_npages ||
	    TAILQ_FIRST(&vp->v_uobj.memq))
		panic("cleaned vnode isn't, vp %p", vp);
	if (vp->v_numoutput)
		panic("clean vnode has pending I/O's, vp %p", vp);
#endif
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	return vp;
}
296
/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 *
 * Takes a shared busy lock on "mp".  If an unmount is in progress the
 * caller sleeps until it finishes (unless LK_NOWAIT, which fails with
 * ENOENT, or LK_RECURSEFAIL when we ARE the unmounter, which fails
 * with EDEADLK).  Returns 0 on success, ENOENT if the mount vanished.
 *
 * "interlkp", when non-NULL, is a simplelock held by the caller; it is
 * dropped across the sleep and re-taken, and handed to lockmgr() via
 * LK_INTERLOCK for the final acquisition.
 */
int
vfs_busy(mp, flags, interlkp)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
{
	int lkflags;

	while (mp->mnt_iflag & IMNT_UNMOUNT) {
		int gone, n;

		if (flags & LK_NOWAIT)
			return (ENOENT);
		if ((flags & LK_RECURSEFAIL) && mp->mnt_unmounter != NULL
		    && mp->mnt_unmounter == curproc)
			return (EDEADLK);
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		simple_lock(&mp->mnt_slock);
		mp->mnt_wcnt++;
		ltsleep((caddr_t)mp, PVFS, "vfs_busy", 0, &mp->mnt_slock);
		n = --mp->mnt_wcnt;
		simple_unlock(&mp->mnt_slock);
		/* Sample IMNT_GONE before anyone can recycle "mp". */
		gone = mp->mnt_iflag & IMNT_GONE;

		/* Last waiter out notifies dounmount() it may proceed. */
		if (n == 0)
			wakeup(&mp->mnt_wcnt);
		if (interlkp)
			simple_lock(interlkp);
		if (gone)
			return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}
346
347 /*
348 * Free a busy filesystem.
349 */
350 void
351 vfs_unbusy(mp)
352 struct mount *mp;
353 {
354
355 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
356 }
357
358 /*
359 * Lookup a filesystem type, and if found allocate and initialize
360 * a mount structure for it.
361 *
362 * Devname is usually updated by mount(8) after booting.
363 */
364 int
365 vfs_rootmountalloc(fstypename, devname, mpp)
366 char *fstypename;
367 char *devname;
368 struct mount **mpp;
369 {
370 struct vfsops *vfsp = NULL;
371 struct mount *mp;
372
373 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
374 if (!strncmp(vfsp->vfs_name, fstypename, MFSNAMELEN))
375 break;
376
377 if (vfsp == NULL)
378 return (ENODEV);
379 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
380 memset((char *)mp, 0, (u_long)sizeof(struct mount));
381 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
382 simple_lock_init(&mp->mnt_slock);
383 (void)vfs_busy(mp, LK_NOWAIT, 0);
384 LIST_INIT(&mp->mnt_vnodelist);
385 mp->mnt_op = vfsp;
386 mp->mnt_flag = MNT_RDONLY;
387 mp->mnt_vnodecovered = NULLVP;
388 mp->mnt_leaf = mp;
389 vfsp->vfs_refcount++;
390 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name, MFSNAMELEN);
391 mp->mnt_stat.f_mntonname[0] = '/';
392 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
393 *mpp = mp;
394 return (0);
395 }
396
397 /*
398 * Lookup a mount point by filesystem identifier.
399 */
400 struct mount *
401 vfs_getvfs(fsid)
402 fsid_t *fsid;
403 {
404 struct mount *mp;
405
406 simple_lock(&mountlist_slock);
407 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
408 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
409 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
410 simple_unlock(&mountlist_slock);
411 return (mp);
412 }
413 }
414 simple_unlock(&mountlist_slock);
415 return ((struct mount *)0);
416 }
417
/*
 * Get a new unique fsid
 *
 * Builds a two-word fsid for "mp": word 1 is the hashed filesystem
 * type (makefstype), word 0 a pseudo-device number made unique by
 * probing the mount list with vfs_getvfs().  Serialized on
 * mntid_slock, which also protects the static counter.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	/* Monotonic minor-number seed, shared across all calls. */
	static u_short xxxfs_mntid;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = makefstype(mp->mnt_op->vfs_name);
	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	if (!CIRCLEQ_EMPTY(&mountlist)) {
		/* Bump the candidate until no existing mount claims it. */
		while (vfs_getvfs(&tfsid)) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	simple_unlock(&mntid_slock);
}
448
/*
 * Make a 'unique' number from a mount type name.
 *
 * Folds each byte of the NUL-terminated name into an accumulator by
 * shifting the running value left two bits and XOR-ing in the
 * character.  Pure function of the name; an empty string yields 0.
 */
long
makefstype(const char *type)
{
	long hash = 0;
	const char *cp;

	for (cp = type; *cp != '\0'; cp++)
		hash = (hash << 2) ^ *cp;
	return hash;
}
464
465
466 /*
467 * Set vnode attributes to VNOVAL
468 */
469 void
470 vattr_null(vap)
471 struct vattr *vap;
472 {
473
474 vap->va_type = VNON;
475
476 /*
477 * Assign individually so that it is safe even if size and
478 * sign of each member are varied.
479 */
480 vap->va_mode = VNOVAL;
481 vap->va_nlink = VNOVAL;
482 vap->va_uid = VNOVAL;
483 vap->va_gid = VNOVAL;
484 vap->va_fsid = VNOVAL;
485 vap->va_fileid = VNOVAL;
486 vap->va_size = VNOVAL;
487 vap->va_blocksize = VNOVAL;
488 vap->va_atime.tv_sec =
489 vap->va_mtime.tv_sec =
490 vap->va_ctime.tv_sec =
491 vap->va_birthtime.tv_sec = VNOVAL;
492 vap->va_atime.tv_nsec =
493 vap->va_mtime.tv_nsec =
494 vap->va_ctime.tv_nsec =
495 vap->va_birthtime.tv_nsec = VNOVAL;
496 vap->va_gen = VNOVAL;
497 vap->va_flags = VNOVAL;
498 vap->va_rdev = VNOVAL;
499 vap->va_bytes = VNOVAL;
500 vap->va_vaflags = 0;
501 }
502
/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p)(void *);	/* vnode ops for dead vnodes */
/*
 * Count of vnodes currently allocated from vnode_pool; incremented in
 * getnewvnode() and decremented in vfs_drainvnodes(), in both cases
 * under vnode_free_list_slock.
 */
long numvnodes;
508
/*
 * Return the next vnode from the free list.
 *
 * Allocates a fresh vnode from vnode_pool when the table is below
 * desiredvnodes (or both free lists are empty), otherwise recycles one
 * via getcleanvnode().  On success stores the initialized vnode (with
 * v_usecount = 1, type VNON, hooked onto "mp") in *vpp and returns 0;
 * returns ENFILE when the table is full and nothing can be recycled,
 * or a vfs_busy() error if "mp" is being unmounted.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops)(void *);
	struct vnode **vpp;
{
	extern struct uvm_pagerops uvm_vnodeops;
	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	static int toggle;
	struct vnode *vp;
	int error = 0, tryalloc;

 try_again:
	if (mp) {
		/*
		 * Mark filesystem busy while we're creating a vnode.
		 * If unmount is in progress, this will wait; if the
		 * unmount succeeds (only if umount -f), this will
		 * return an error.  If the unmount fails, we'll keep
		 * going afterwards.
		 * (This puts the per-mount vnode list logically under
		 * the protection of the vfs_busy lock).
		 */
		error = vfs_busy(mp, LK_RECURSEFAIL, 0);
		if (error && error != EDEADLK)
			return error;
	}

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reticent to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */

	vp = NULL;

	simple_lock(&vnode_free_list_slock);

	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	tryalloc = numvnodes < desiredvnodes ||
	    (TAILQ_FIRST(&vnode_free_list) == NULL &&
	     (TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));

	if (tryalloc &&
	    (vp = pool_get(&vnode_pool, PR_NOWAIT)) != NULL) {
		/* Fresh allocation: count it and drop the list lock. */
		numvnodes++;
		simple_unlock(&vnode_free_list_slock);
		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		uobj = &vp->v_uobj;
		uobj->pgops = &uvm_vnodeops;
		TAILQ_INIT(&uobj->memq);
		/*
		 * done by memset() above.
		 *	uobj->uo_npages = 0;
		 *	LIST_INIT(&vp->v_nclist);
		 *	LIST_INIT(&vp->v_dnclist);
		 */
	} else {
		/* getcleanvnode() consumes vnode_free_list_slock. */
		vp = getcleanvnode(p);
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			if (mp && error != EDEADLK)
				vfs_unbusy(mp);
			if (tryalloc) {
				/*
				 * Pool allocation failed above; wait a
				 * tick for memory and start over.
				 */
				printf("WARNING: unable to allocate new "
				    "vnode, retrying...\n");
				(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
				goto try_again;
			}
			tablefull("vnode", "increase kern.maxvnodes or NVNODE");
			*vpp = 0;
			return (ENFILE);
		}
		vp->v_flag = 0;
		vp->v_socket = NULL;
	}
	/* Common (re)initialization for both fresh and recycled vnodes. */
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	KASSERT(LIST_EMPTY(&vp->v_nclist));
	KASSERT(LIST_EMPTY(&vp->v_dnclist));
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_interlock);

	/*
	 * initialize uvm_object within vnode.
	 */

	uobj = &vp->v_uobj;
	KASSERT(uobj->pgops == &uvm_vnodeops);
	KASSERT(uobj->uo_npages == 0);
	KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
	vp->v_size = VSIZENOTSET;

	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
}
635
/*
 * This is really just the reverse of getnewvnode(). Needed for
 * VFS_VGET functions who may need to push back a vnode in case
 * of a locking race.
 *
 * The vnode must still carry the single reference getnewvnode() gave
 * it (v_usecount == 1); it is detached from its mount, marked VBAD,
 * and pushed back onto the front of the appropriate free list.
 */
void
ungetnewvnode(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 1)
		panic("ungetnewvnode: busy vnode");
#endif
	vp->v_usecount--;
	insmntque(vp, NULL);
	vp->v_type = VBAD;

	simple_lock(&vp->v_interlock);
	/*
	 * Insert at head of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	/* Held vnodes go on the hold list so their buffers survive. */
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_HEAD(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
}
665
666 /*
667 * Move a vnode from one mount queue to another.
668 */
669 void
670 insmntque(vp, mp)
671 struct vnode *vp;
672 struct mount *mp;
673 {
674
675 #ifdef DIAGNOSTIC
676 if ((mp != NULL) &&
677 (mp->mnt_iflag & IMNT_UNMOUNT) &&
678 !(mp->mnt_flag & MNT_SOFTDEP) &&
679 vp->v_tag != VT_VFS) {
680 panic("insmntque into dying filesystem");
681 }
682 #endif
683
684 simple_lock(&mntvnode_slock);
685 /*
686 * Delete from old mount point vnode list, if on one.
687 */
688 if (vp->v_mount != NULL)
689 LIST_REMOVE(vp, v_mntvnodes);
690 /*
691 * Insert into list of vnodes for the new mount point, if available.
692 */
693 if ((vp->v_mount = mp) != NULL)
694 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
695 simple_unlock(&mntvnode_slock);
696 }
697
/*
 * Update outstanding I/O count and do wakeup if requested.
 *
 * Decrements v_numoutput for the buffer's vnode and, if a thread is
 * sleeping in VBWAIT waiting for writes to drain, wakes it once the
 * count reaches zero.  Safe to call from interrupt context (biodone),
 * which is why the global lock is used instead of v_interlock.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		/* XXX global lock hack
		 * can't use v_interlock here since this is called
		 * in interrupt context from biodone().
		 */
		simple_lock(&global_v_numoutput_slock);
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
		simple_unlock(&global_v_numoutput_slock);
	}
}
722
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 *
 * With V_SAVE, dirty data is first written out (PGO_CLEANIT pages plus
 * a full VOP_FSYNC); without it, everything is simply discarded.
 * Returns 0 on success or the first error from page flushing, fsync,
 * or an interrupted sleep (slpflag/slptimeo are only honored for the
 * buffer-busy sleeps, per the XXXUBC note below).
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	/*
	 * NOTE(review): no unlock of v_interlock appears on the error
	 * path -- presumably VOP_PUTPAGES consumes the interlock, as
	 * the other callers in this file follow the same pattern.
	 */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
		        return (error);
#ifdef DIAGNOSTIC
		s = splbio();
		if (vp->v_numoutput > 0 || !LIST_EMPTY(&vp->v_dirtyblkhd))
		        panic("vinvalbuf: dirty bufs, vp %p", vp);
		splx(s);
#endif
	}

	s = splbio();

restart:
	/*
	 * Scan the clean list; any buffer found busy is slept on and
	 * the whole scan restarted, since the lists may have changed.
	 */
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	/* Same protocol for the dirty list. */
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep((caddr_t)bp,
			    slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vinvalbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_flags |= B_BUSY | B_VFLUSH;
			simple_unlock(&bp->b_interlock);
			VOP_BWRITE(bp);
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	splx(s);

	return (0);
}
824
/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 *
 * Pages past "lbn" (converted to a byte offset via the mount's block
 * shift) are freed first; then both buffer lists are scanned and every
 * buffer at or beyond "lbn" is invalidated.  Busy buffers are slept
 * on (honoring slpflag/slptimeo) and the scan restarted.  Returns 0
 * or the first error from the page flush or an interrupted sleep.
 */
int
vtruncbuf(vp, lbn, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t lbn;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int s, error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	/* VOP_PUTPAGES is entered with v_interlock held (see vinvalbuf). */
	simple_lock(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	s = splbio();

restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		/* Only buffers at or past the truncation point. */
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		simple_lock(&bp->b_interlock);
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "vtruncbuf", slptimeo, &bp->b_interlock);
			if (error) {
				splx(s);
				return (error);
			}
			goto restart;
		}
		bp->b_flags |= B_BUSY | B_INVAL | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		brelse(bp);
	}

	splx(s);

	return (0);
}
894
/*
 * Write out (asynchronously or, with "sync", synchronously) all dirty
 * pages and buffers of a vnode.  When "sync" is set, waits for all
 * outstanding writes (v_numoutput) to drain and rescans until the
 * dirty list is empty.
 */
void
vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
{
	struct buf *bp, *nbp;
	int flags = PGO_CLEANIT | PGO_ALLPAGES | (sync ? PGO_SYNCIO : 0);
	int s;

	/* VOP_PUTPAGES is entered with v_interlock held (see vinvalbuf). */
	simple_lock(&vp->v_interlock);
	(void) VOP_PUTPAGES(vp, 0, 0, flags);

loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		simple_lock(&bp->b_interlock);
		/* Skip buffers already being written by someone else. */
		if ((bp->b_flags & B_BUSY)) {
			simple_unlock(&bp->b_interlock);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_flags |= B_BUSY | B_VFLUSH;
		simple_unlock(&bp->b_interlock);
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		/* The list may have changed while unlocked; rescan. */
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	/* Synchronous case: wait for all writes on this vnode to finish. */
	simple_lock(&global_v_numoutput_slock);
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		ltsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0,
		    &global_v_numoutput_slock);
	}
	simple_unlock(&global_v_numoutput_slock);
	splx(s);
	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}
948
/*
 * Associate a buffer with a vnode.
 *
 * Takes a hold reference on the vnode (released again in brelvp()),
 * records the vnode and its device (for block/character specials) in
 * the buffer, and inserts the buffer on the vnode's clean list.
 * Buffer-list manipulation is done at splbio.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free, bp %p", bp);
	VHOLD(vp);
	s = splbio();
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
974
/*
 * Disassociate a buffer from a vnode.
 *
 * Removes the buffer from the vnode's buffer list, drops the hold
 * reference taken in bgetvp(), and -- if this leaves the vnode with
 * no pages and no dirty buffers -- takes the vnode off the syncer
 * worklist.
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == NULL)
		panic("brelvp: vp NULL, bp %p", bp);

	s = splbio();
	vp = bp->b_vp;
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	/* Nothing left to sync: retire the vnode from the worklist. */
	if (TAILQ_EMPTY(&vp->v_uobj.memq) && (vp->v_flag & VONWORKLST) &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~(VWRITEMAPDIRTY|VONWORKLST);
		LIST_REMOVE(vp, v_synclist);
	}

	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
}
1006
/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * This function must be called at splbio().
 *
 * A clean buffer goes on newvp's clean list (possibly retiring newvp
 * from the syncer worklist); a delayed-write buffer goes on the dirty
 * list and, if newvp is not yet on the worklist, schedules it with a
 * delay chosen by vnode type (directory / device metadata / file).
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		/* No pages and no dirty buffers left: leave the worklist. */
		if (TAILQ_EMPTY(&newvp->v_uobj.memq) &&
		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~(VWRITEMAPDIRTY|VONWORKLST);
			LIST_REMOVE(newvp, v_synclist);
		}
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				/* Mounted block device: metadata delay. */
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
				break;
			}
			/* Async mounts flush on their own schedule. */
			if (!newvp->v_mount ||
			    (newvp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}
1063
1064 /*
1065 * Create a vnode for a block device.
1066 * Used for root filesystem and swap areas.
1067 * Also used for memory file system special devices.
1068 */
1069 int
1070 bdevvp(dev, vpp)
1071 dev_t dev;
1072 struct vnode **vpp;
1073 {
1074
1075 return (getdevvp(dev, vpp, VBLK));
1076 }
1077
1078 /*
1079 * Create a vnode for a character device.
1080 * Used for kernfs and some console handling.
1081 */
1082 int
1083 cdevvp(dev, vpp)
1084 dev_t dev;
1085 struct vnode **vpp;
1086 {
1087
1088 return (getdevvp(dev, vpp, VCHR));
1089 }
1090
1091 /*
1092 * Create a vnode for a device.
1093 * Used by bdevvp (block device) for root file system etc.,
1094 * and by cdevvp (character device) for console and kernfs.
1095 */
1096 int
1097 getdevvp(dev, vpp, type)
1098 dev_t dev;
1099 struct vnode **vpp;
1100 enum vtype type;
1101 {
1102 struct vnode *vp;
1103 struct vnode *nvp;
1104 int error;
1105
1106 if (dev == NODEV) {
1107 *vpp = NULLVP;
1108 return (0);
1109 }
1110 error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
1111 if (error) {
1112 *vpp = NULLVP;
1113 return (error);
1114 }
1115 vp = nvp;
1116 vp->v_type = type;
1117 if ((nvp = checkalias(vp, dev, NULL)) != 0) {
1118 vput(vp);
1119 vp = nvp;
1120 }
1121 *vpp = vp;
1122 return (0);
1123 }
1124
/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 *
 * Returns NULLVP when "nvp" was installed on the hash chain itself
 * (no in-use alias, or only a non-VBLK/non-VT_NON one); returns the
 * cleaned pre-existing vnode otherwise, with "nvp" demoted to VNON.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	/* Aliasing only applies to special-device vnodes. */
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		simple_unlock(&spechash_slock);
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			goto loop;
		}
		/*
		 * What we're interested to know here is if someone else has
		 * removed this vnode from the device hash list while we were
		 * waiting.  This can only happen if vclean() did it, and
		 * this requires the vnode to be locked.  Therefore, we use
		 * LK_SLEEPFAIL and retry.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL))
			goto loop;
		simple_lock(&spechash_slock);
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		/* Install "nvp" as a (possibly additional) alias. */
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_NOWAIT);
		/* XXX Erg. */
		if (nvp->v_specinfo == NULL) {
			/* Out of memory: wait for pages and retry. */
			simple_unlock(&spechash_slock);
			uvm_wait("checkalias");
			goto loop;
		}

		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		nvp->v_speclockf = NULL;
		simple_lock_init(&nvp->v_spec_cow_slock);
		SLIST_INIT(&nvp->v_spec_cow_head);
		nvp->v_spec_cow_req = 0;
		nvp->v_spec_cow_count = 0;

		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * Existing VBLK/VT_NON alias: clean it out and hand it back,
	 * re-tagged with nvp's ops so the caller can reuse it.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}
1214
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. If the vnode lock bit is set the
 * vnode is being eliminated in vgone. In that case, we can not
 * grab the vnode, so the process is awakened when the transition is
 * completed, and an error returned to indicate that the vnode is no
 * longer usable (possibly having been changed to a new file system type).
 *
 * flags: LK_INTERLOCK if the caller already holds v_interlock
 * (always released before return), LK_NOWAIT to fail with EBUSY
 * instead of sleeping on VXLOCK, plus any LK_TYPE_MASK lock type
 * to pass on to vn_lock().
 */
int
vget(vp, flags)
	struct vnode *vp;
	int flags;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */

	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (flags & LK_NOWAIT) {
			simple_unlock(&vp->v_interlock);
			return EBUSY;
		}
		/* Sleep until vclean() finishes; PNORELOCK drops the interlock. */
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		/* First reference: take the vnode off its free/hold list. */
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vget", vp);
		panic("vget: usecount overflow, vp %p", vp);
	}
#endif
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK))) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (vp->v_usecount > 0) {
				simple_unlock(&vp->v_interlock);
				return (error);
			}
			/*
			 * insert at tail of LRU list
			 */
			simple_lock(&vnode_free_list_slock);
			if (vp->v_holdcnt > 0)
				TAILQ_INSERT_TAIL(&vnode_hold_list, vp,
				    v_freelist);
			else
				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
				    v_freelist);
			simple_unlock(&vnode_free_list_slock);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	/* No lock type requested: return referenced but unlocked. */
	simple_unlock(&vp->v_interlock);
	return (0);
}
1297
/*
 * vput(), just unlock and vrele()
 *
 * Drops one reference and the vnode lock.  If this was the last
 * reference, the vnode is placed on the tail of the appropriate
 * LRU list and VOP_INACTIVE() is called (which unlocks the vnode).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		/* Others still hold references: just unlock and go. */
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 * Held vnodes (v_holdcnt > 0) go on the hold list instead.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		/* Move the page accounting from executable to plain file pages. */
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);	/* releases the vnode lock */
}
1341
/*
 * Vnode release.
 * If count drops to zero, call inactive routine and return to freelist.
 *
 * Unlike vput(), the vnode is expected to be unlocked on entry; it
 * is locked here (via vn_lock) only to run VOP_INACTIVE().
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		/* Others still hold references: nothing more to do. */
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
	 * Insert at tail of LRU list.
	 * Held vnodes (v_holdcnt > 0) go on the hold list instead.
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	if (vp->v_flag & VEXECMAP) {
		/* Move the page accounting from executable to plain file pages. */
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP);
	/*
	 * vn_lock consumes the interlock.  If the lock cannot be
	 * obtained, VOP_INACTIVE is skipped for this release.
	 */
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
}
1385
#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 *
 * NOTE(review): the trailing 'l' conventionally means the caller
 * holds vp->v_interlock — confirm at call sites.
 */
void
vholdl(vp)
	struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
}
1417
/*
 * Page or buffer structure frees a reference.
 *
 * Counterpart of vholdl(): drops one hold reference and, when the
 * vnode is otherwise unreferenced, moves it from the hold list back
 * to the free list.
 */
void
holdrelel(vp)
	struct vnode *vp;
{

	/* Dropping a hold that was never taken indicates a caller bug. */
	if (vp->v_holdcnt <= 0)
		panic("holdrelel: holdcnt vp %p", vp);
	vp->v_holdcnt--;

	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list. The test of the back
	 * pointer and the use reference count of zero is because
	 * it will be removed from a free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */

	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
}
1452
/*
 * Vnode reference.
 *
 * Adds a reference to a vnode that is already referenced; taking
 * the first reference must go through vget() (which also handles
 * removal from the free list), hence the panic below.
 */
void
vref(vp)
	struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required, vp %p", vp);
	vp->v_usecount++;
#ifdef DIAGNOSTIC
	if (vp->v_usecount == 0) {
		vprint("vref", vp);
		panic("vref: usecount overflow, vp %p", vp);
	}
#endif
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */
1474
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked V_SYSTEM to be skipped.
 *
 * Returns 0 on success, EBUSY if any vnodes could not be flushed.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * The list may have changed while mntvnode_slock was
		 * dropped below; restart the scan if vp moved off mp.
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 * vgonel() consumes the interlock.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				/* Keep the device node alive but detach it from mp. */
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		/* Active vnode we may not touch: count it and move on. */
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}
1570
/*
 * Disassociate the underlying file system from a vnode.
 *
 * Called with vp->v_interlock held; the interlock is released in the
 * course of cleaning (via VOP_LOCK with LK_INTERLOCK).  On return the
 * vnode has dead_vnodeop_p ops, tag VT_NON, and any VXWANT sleepers
 * have been woken.  flags may include DOCLOSE to flush buffers and
 * close the underlying object.
 */
void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	struct mount *mp;
	int active;

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */

	if ((active = vp->v_usecount) != 0) {
		vp->v_usecount++;
#ifdef DIAGNOSTIC
		if (vp->v_usecount == 0) {
			vprint("vclean", vp);
			panic("vclean: usecount overflow");
		}
#endif
	}

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
	if (vp->v_flag & VEXECMAP) {
		/* Move the page accounting from executable to plain file pages. */
		uvmexp.execpages -= vp->v_uobj.uo_npages;
		uvmexp.filepages += vp->v_uobj.uo_npages;
	}
	vp->v_flag &= ~(VTEXT|VEXECMAP);

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. For
	 * active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 *
	 * We drain the lock to make sure we are the last one trying to
	 * get it and immediately resurrect the lock. Future accesses
	 * for locking this _vnode_ will be protected by VXLOCK. However,
	 * upper layers might be using the _lock_ in case the file system
	 * exported it and might access it while the vnode lingers in
	 * deadfs.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_RESURRECT | LK_INTERLOCK);

	/*
	 * Clean out any cached data associated with the vnode.
	 * If special device, remove it from special device alias list.
	 * if it is on one.
	 */
	if (flags & DOCLOSE) {
		int error;
		struct vnode *vq, *vx;

		/* Flush dirty buffers; on failure discard them instead. */
		vn_start_write(vp, &mp, V_WAIT | V_LOWER);
		error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
		vn_finished_write(mp, V_LOWER);
		if (error)
			error = vinvalbuf(vp, 0, NOCRED, p, 0, 0);
		KASSERT(error == 0);
		KASSERT((vp->v_flag & VONWORKLST) == 0);

		if (active)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL);

		if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
		    vp->v_specinfo != 0) {
			/* Unlink vp from the spechash chain. */
			simple_lock(&spechash_slock);
			if (vp->v_hashchain != NULL) {
				if (*vp->v_hashchain == vp) {
					*vp->v_hashchain = vp->v_specnext;
				} else {
					for (vq = *vp->v_hashchain; vq;
					     vq = vq->v_specnext) {
						if (vq->v_specnext != vp)
							continue;
						vq->v_specnext = vp->v_specnext;
						break;
					}
					if (vq == NULL)
						panic("missing bdev");
				}
				if (vp->v_flag & VALIASED) {
					/*
					 * If only one alias remains after
					 * removing vp, clear its VALIASED bit.
					 */
					vx = NULL;
					for (vq = *vp->v_hashchain; vq;
					     vq = vq->v_specnext) {
						if (vq->v_rdev != vp->v_rdev ||
						    vq->v_type != vp->v_type)
							continue;
						if (vx)
							break;
						vx = vq;
					}
					if (vx == NULL)
						panic("missing alias");
					if (vq == NULL)
						vx->v_flag &= ~VALIASED;
					vp->v_flag &= ~VALIASED;
				}
			}
			simple_unlock(&spechash_slock);
			FREE(vp->v_specinfo, M_VNODE);
			vp->v_specinfo = NULL;
		}
	}
	LOCK_ASSERT(!simple_lock_held(&vp->v_interlock));

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			/*
			 * Insert at tail of LRU list.
			 */

			simple_unlock(&vp->v_interlock);
			simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean, vp %p", vp);
#endif
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			simple_unlock(&vnode_free_list_slock);
		} else
			simple_unlock(&vp->v_interlock);
	}

	KASSERT(vp->v_uobj.uo_npages == 0);
	cache_purge(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_vnlock = NULL;
	simple_lock(&vp->v_interlock);
	VN_KNOTE(vp, NOTE_REVOKE);	/* FreeBSD has this in vn_pollgone() */
	vp->v_flag &= ~(VXLOCK|VLOCKSWORK);
	if (vp->v_flag & VXWANT) {
		/* Wake anyone who slept in vget()/vgonel() on VXLOCK. */
		vp->v_flag &= ~VXWANT;
		simple_unlock(&vp->v_interlock);
		wakeup((caddr_t)vp);
	} else
		simple_unlock(&vp->v_interlock);
}
1759
1760 /*
1761 * Recycle an unused vnode to the front of the free list.
1762 * Release the passed interlock if the vnode will be recycled.
1763 */
1764 int
1765 vrecycle(vp, inter_lkp, p)
1766 struct vnode *vp;
1767 struct simplelock *inter_lkp;
1768 struct proc *p;
1769 {
1770
1771 simple_lock(&vp->v_interlock);
1772 if (vp->v_usecount == 0) {
1773 if (inter_lkp)
1774 simple_unlock(inter_lkp);
1775 vgonel(vp, p);
1776 return (1);
1777 }
1778 simple_unlock(&vp->v_interlock);
1779 return (0);
1780 }
1781
1782 /*
1783 * Eliminate all activity associated with a vnode
1784 * in preparation for reuse.
1785 */
1786 void
1787 vgone(vp)
1788 struct vnode *vp;
1789 {
1790 struct proc *p = curproc; /* XXX */
1791
1792 simple_lock(&vp->v_interlock);
1793 vgonel(vp, p);
1794 }
1795
/*
 * vgone, with the vp interlock held.
 *
 * The interlock is consumed: either by the VXLOCK sleep (PNORELOCK)
 * or by vclean().  If the vnode ends up unreferenced and still on
 * the freelist, it is destroyed here.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{

	LOCK_ASSERT(simple_lock_held(&vp->v_interlock));

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */

	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		ltsleep(vp, PINOD | PNORELOCK, "vgone", 0, &vp->v_interlock);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */

	vclean(vp, DOCLOSE, p);
	KASSERT((vp->v_flag & VONWORKLST) == 0);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */

	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);

	/*
	 * The test of the back pointer and the reference count of
	 * zero is because it will be removed from the free list by
	 * getcleanvnode, but will not have its reference count
	 * incremented until after calling vgone. If the reference
	 * count were incremented first, vgone would (incorrectly)
	 * try to close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */

	vp->v_type = VBAD;
	if (vp->v_usecount == 0) {
		boolean_t dofree;

		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean, vp %p", vp);
		/*
		 * if it isn't on the freelist, we're called by getcleanvnode
		 * and vnode is being re-used. otherwise, we'll free it.
		 */
		dofree = vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb;
		if (dofree) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			numvnodes--;
		}
		simple_unlock(&vnode_free_list_slock);
		if (dofree)
			pool_put(&vnode_pool, vp);
	}
}
1865
1866 /*
1867 * Lookup a vnode by device number.
1868 */
1869 int
1870 vfinddev(dev, type, vpp)
1871 dev_t dev;
1872 enum vtype type;
1873 struct vnode **vpp;
1874 {
1875 struct vnode *vp;
1876 int rc = 0;
1877
1878 simple_lock(&spechash_slock);
1879 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1880 if (dev != vp->v_rdev || type != vp->v_type)
1881 continue;
1882 *vpp = vp;
1883 rc = 1;
1884 break;
1885 }
1886 simple_unlock(&spechash_slock);
1887 return (rc);
1888 }
1889
1890 /*
1891 * Revoke all the vnodes corresponding to the specified minor number
1892 * range (endpoints inclusive) of the specified major.
1893 */
1894 void
1895 vdevgone(maj, minl, minh, type)
1896 int maj, minl, minh;
1897 enum vtype type;
1898 {
1899 struct vnode *vp;
1900 int mn;
1901
1902 for (mn = minl; mn <= minh; mn++)
1903 if (vfinddev(makedev(maj, mn), type, &vp))
1904 VOP_REVOKE(vp, REVOKEALL);
1905 }
1906
/*
 * Calculate the total number of references to a special device.
 *
 * Walks the spechash chain summing v_usecount over all aliases of
 * vp's device; unreferenced aliases encountered along the way are
 * flushed out (vgone) and the scan restarts.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	/* No aliases: the answer is just this vnode's own count. */
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		/* Fetch the successor first; vq may be vgone'd below. */
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * (Skip aliases already being cleaned: VXLOCK.)
		 */
		if (vq->v_usecount == 0 && vq != vp &&
		    (vq->v_flag & VXLOCK) == 0) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}
1939
/*
 * ARRAY_SIZE: number of elements in a statically-sized array.
 * ARRAY_PRINT: index a string table safely, yielding "UNKNOWN" for
 * out-of-range indices.  The lower bound must be inclusive: index 0
 * is a valid slot (the vnode tag/type tables below are indexed by
 * enums whose first member is 0), and the old "(idx) > 0" test made
 * vprint() report "UNKNOWN" for it.  The (size_t) cast keeps the
 * upper-bound comparison unsigned without relying on implicit
 * signed/unsigned conversion.
 */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((idx) >= 0 && (size_t)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
1943
/* String tables indexed by vnode tag/type enums, plus flag names. */
const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;

/*
 * Print out a description of a vnode.
 *
 * label, if non-NULL, is printed as a prefix.  Emits tag, type,
 * reference counts, decoded flags, and — when v_data is set — the
 * filesystem-specific VOP_PRINT output.
 */
void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: ", label);
	printf("tag %s(%d) type %s(%d), usecount %d, writecount %ld, "
	    "refcount %ld,", ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
	/* Decode v_flag into symbolic names; buf[0] is the leading separator. */
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}
1974
#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		/* vfs_busy() releases mountlist_slock on success. */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
		/* Re-take the list lock and advance before unbusying mp. */
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);
}
#endif
2005
/*
 * sysctl helper routine for vfs.generic.conf lookups.
 *
 * name[0] selects a compat filesystem type number; the matching
 * vfsops is packaged into a struct vfsconf and returned via
 * sysctl_lookup().
 */
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
static int
sysctl_vfs_generic_conf(SYSCTLFN_ARGS)
{
	struct vfsconf vfc;
	extern const char * const mountcompatnames[];
	extern int nmountcompatnames;
	struct sysctlnode node;
	struct vfsops *vfsp;
	u_int vfsnum;

	if (namelen != 1)
		return (ENOTDIR);
	vfsnum = name[0];
	if (vfsnum >= nmountcompatnames ||
	    mountcompatnames[vfsnum] == NULL)
		return (EOPNOTSUPP);
	vfsp = vfs_getopsbyname(mountcompatnames[vfsnum]);
	if (vfsp == NULL)
		return (EOPNOTSUPP);

	/* Synthesize the legacy vfsconf record from the live vfsops. */
	vfc.vfc_vfsops = vfsp;
	strncpy(vfc.vfc_name, vfsp->vfs_name, MFSNAMELEN);
	vfc.vfc_typenum = vfsnum;
	vfc.vfc_refcount = vfsp->vfs_refcount;
	vfc.vfc_flags = 0;
	vfc.vfc_mountroot = vfsp->vfs_mountroot;
	vfc.vfc_next = NULL;

	node = *rnode;
	node.sysctl_data = &vfc;
	return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
#endif
2043
/*
 * sysctl helper routine to return list of supported fstypes
 *
 * Produces a single space-separated string of filesystem names.
 * With oldp == NULL it only computes the required buffer size.
 */
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char buf[MFSNAMELEN];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			/* Entries after the first are prefixed with a space. */
			memset(buf, 0, sizeof(buf));
			if (first) {
				strncpy(buf, v->vfs_name, sizeof(buf));
				first = 0;
			} else {
				buf[0] = ' ';
				strncpy(buf + 1, v->vfs_name, sizeof(buf) - 1);
			}
			buf[sizeof(buf)-1] = '\0';
			slen = strlen(buf);
			if (left < slen + 1)
				break;
			/* +1 to copy out the trailing NUL byte */
			error = copyout(buf, where, slen + 1);
			if (error)
				break;
			/*
			 * Advance by slen only: the next entry's leading
			 * space overwrites this NUL, so the final copyout's
			 * NUL terminates the whole string.
			 */
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	*oldlenp = needed;
	return (error);
}
2094
/*
 * Top level filesystem related information gathering.
 *
 * Registers the vfs and vfs.generic sysctl nodes plus the
 * maxtypenum/usermount/fstypes/conf leaves (the compat leaves only
 * when the corresponding COMPAT options are configured).
 */
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	extern int nmountcompatnames;
#endif

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vfs", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "generic",
		       SYSCTL_DESCR("Non-specific vfs related information"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_EOL);

#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		       CTLTYPE_INT, "maxtypenum",
		       SYSCTL_DESCR("Highest valid filesystem type number"),
		       NULL, nmountcompatnames, NULL, 0,
		       CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "usermount",
		       SYSCTL_DESCR("Whether unprivileged users may mount "
				    "filesystems"),
		       NULL, 0, &dovfsusermount, 0,
		       CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRING, "fstypes",
		       SYSCTL_DESCR("List of file systems present"),
		       sysctl_vfs_generic_fstypes, 0, NULL, 0,
		       CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
#if defined(COMPAT_09) || defined(COMPAT_43) || defined(COMPAT_44)
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "conf",
		       SYSCTL_DESCR("Filesystem configuration information"),
		       sysctl_vfs_generic_conf, 0, NULL,
		       sizeof(struct vfsconf),
		       CTL_VFS, VFS_GENERIC, VFS_CONF, CTL_EOL);
#endif
}
2147
2148
int	kinfo_vdebug = 1;	/* log scan restarts caused by vnode churn */
int	kinfo_vgetfailed;
#define KINFO_VNODESLOP	10	/* slack entries in the size estimate */
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	char *bp = where, *savebp;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(struct vnode *)
#define VNODESZ	sizeof(struct vnode)
	if (where == NULL) {
		/* Size probe: estimate with some slop for concurrent growth. */
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		/* vfs_busy() releases mountlist_slock on success. */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		savebp = bp;
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem. RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				/* Rewind this mount's output and rescan. */
				bp = savebp;
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			/* Drop the list lock across copyout (it may fault). */
			simple_unlock(&mntvnode_slock);
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			    (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}
2229
2230 /*
2231 * Check to see if a filesystem is mounted on a block device.
2232 */
2233 int
2234 vfs_mountedon(vp)
2235 struct vnode *vp;
2236 {
2237 struct vnode *vq;
2238 int error = 0;
2239
2240 if (vp->v_specmountpoint != NULL)
2241 return (EBUSY);
2242 if (vp->v_flag & VALIASED) {
2243 simple_lock(&spechash_slock);
2244 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2245 if (vq->v_rdev != vp->v_rdev ||
2246 vq->v_type != vp->v_type)
2247 continue;
2248 if (vq->v_specmountpoint != NULL) {
2249 error = EBUSY;
2250 break;
2251 }
2252 }
2253 simple_unlock(&spechash_slock);
2254 }
2255 return (error);
2256 }
2257
/*
 * Sanity-check an export address: length must match the family's
 * sockaddr size, the port must be zero, and (for AF_INET) the
 * sin_zero padding must be all zeroes.  Returns 0 if acceptable,
 * -1 otherwise (including unsupported families).
 */
static int
sacheck(struct sockaddr *sa)
{

	switch (sa->sa_family) {
#ifdef INET
	case AF_INET: {
		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
		const char *pad = (const char *)sin->sin_zero;
		size_t n;

		if (sin->sin_len != sizeof(*sin) || sin->sin_port != 0)
			return -1;
		for (n = 0; n < sizeof(sin->sin_zero); n++) {
			if (pad[n] != '\0')
				return -1;
		}
		return 0;
	}
#endif
#ifdef INET6
	case AF_INET6: {
		const struct sockaddr_in6 *sin6 =
		    (const struct sockaddr_in6 *)sa;

		if (sin6->sin6_len != sizeof(*sin6) || sin6->sin6_port != 0)
			return -1;
		return 0;
	}
#endif
	default:
		return -1;
	}
}
2293
2294 /*
2295 * Build hash lists of net addresses and hang them off the mount point.
2296 * Called by ufs_mount() to set up the lists of export addresses.
2297 */
2298 static int
2299 vfs_hang_addrlist(mp, nep, argp)
2300 struct mount *mp;
2301 struct netexport *nep;
2302 struct export_args *argp;
2303 {
2304 struct netcred *np, *enp;
2305 struct radix_node_head *rnh;
2306 int i;
2307 struct sockaddr *saddr, *smask = 0;
2308 struct domain *dom;
2309 int error;
2310
2311 if (argp->ex_addrlen == 0) {
2312 if (mp->mnt_flag & MNT_DEFEXPORTED)
2313 return (EPERM);
2314 np = &nep->ne_defexported;
2315 np->netc_exflags = argp->ex_flags;
2316 crcvt(&np->netc_anon, &argp->ex_anon);
2317 np->netc_anon.cr_ref = 1;
2318 mp->mnt_flag |= MNT_DEFEXPORTED;
2319 return (0);
2320 }
2321
2322 if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
2323 return (EINVAL);
2324
2325 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2326 np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
2327 memset((caddr_t)np, 0, i);
2328 saddr = (struct sockaddr *)(np + 1);
2329 error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
2330 if (error)
2331 goto out;
2332 if (saddr->sa_len > argp->ex_addrlen)
2333 saddr->sa_len = argp->ex_addrlen;
2334 if (sacheck(saddr) == -1)
2335 return EINVAL;
2336 if (argp->ex_masklen) {
2337 smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
2338 error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
2339 if (error)
2340 goto out;
2341 if (smask->sa_len > argp->ex_masklen)
2342 smask->sa_len = argp->ex_masklen;
2343 if (smask->sa_family != saddr->sa_family)
2344 return EINVAL;
2345 if (sacheck(smask) == -1)
2346 return EINVAL;
2347 }
2348 i = saddr->sa_family;
2349 if ((rnh = nep->ne_rtable[i]) == 0) {
2350 /*
2351 * Seems silly to initialize every AF when most are not
2352 * used, do so on demand here
2353 */
2354 DOMAIN_FOREACH(dom) {
2355 if (dom->dom_family == i && dom->dom_rtattach) {
2356 dom->dom_rtattach((void **)&nep->ne_rtable[i],
2357 dom->dom_rtoffset);
2358 break;
2359 }
2360 }
2361 if ((rnh = nep->ne_rtable[i]) == 0) {
2362 error = ENOBUFS;
2363 goto out;
2364 }
2365 }
2366
2367 enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
2368 np->netc_rnodes);
2369 if (enp != np) {
2370 if (enp == NULL) {
2371 enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
2372 smask, rnh);
2373 if (enp == NULL) {
2374 error = EPERM;
2375 goto out;
2376 }
2377 } else
2378 enp->netc_refcnt++;
2379
2380 goto check;
2381 } else
2382 enp->netc_refcnt = 1;
2383
2384 np->netc_exflags = argp->ex_flags;
2385 crcvt(&np->netc_anon, &argp->ex_anon);
2386 np->netc_anon.cr_ref = 1;
2387 return 0;
2388 check:
2389 if (enp->netc_exflags != argp->ex_flags ||
2390 crcmp(&enp->netc_anon, &argp->ex_anon) != 0)
2391 error = EPERM;
2392 else
2393 error = 0;
2394 out:
2395 free(np, M_NETADDR);
2396 return error;
2397 }
2398
/*
 * vfs_free_netcred: rnh_walktree() callback used by vfs_free_addrlist()
 * to delete one export entry from its radix tree and free it once the
 * last reference is gone.
 */
/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	struct radix_node_head *rnh = (struct radix_node_head *)w;
	/* The netcred embeds its radix nodes, so rn is also the netcred. */
	struct netcred *np = (struct netcred *)(void *)rn;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	if (--(np->netc_refcnt) <= 0)
		free(np, M_NETADDR);
	return (0);
}
2413
2414 /*
2415 * Free the net address hash lists that are hanging off the mount points.
2416 */
2417 static void
2418 vfs_free_addrlist(nep)
2419 struct netexport *nep;
2420 {
2421 int i;
2422 struct radix_node_head *rnh;
2423
2424 for (i = 0; i <= AF_MAX; i++)
2425 if ((rnh = nep->ne_rtable[i]) != NULL) {
2426 (*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
2427 free((caddr_t)rnh, M_RTABLE);
2428 nep->ne_rtable[i] = 0;
2429 }
2430 }
2431
2432 int
2433 vfs_export(mp, nep, argp)
2434 struct mount *mp;
2435 struct netexport *nep;
2436 struct export_args *argp;
2437 {
2438 int error;
2439
2440 if (argp->ex_flags & MNT_DELEXPORT) {
2441 if (mp->mnt_flag & MNT_EXPUBLIC) {
2442 vfs_setpublicfs(NULL, NULL, NULL);
2443 mp->mnt_flag &= ~MNT_EXPUBLIC;
2444 }
2445 vfs_free_addrlist(nep);
2446 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2447 }
2448 if (argp->ex_flags & MNT_EXPORTED) {
2449 if (argp->ex_flags & MNT_EXPUBLIC) {
2450 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2451 return (error);
2452 mp->mnt_flag |= MNT_EXPUBLIC;
2453 }
2454 if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
2455 return (error);
2456 mp->mnt_flag |= MNT_EXPORTED;
2457 }
2458 return (0);
2459 }
2460
2461 /*
2462 * Set the publicly exported filesystem (WebNFS). Currently, only
2463 * one public filesystem is possible in the spec (RFC 2054 and 2055)
2464 */
2465 int
2466 vfs_setpublicfs(mp, nep, argp)
2467 struct mount *mp;
2468 struct netexport *nep;
2469 struct export_args *argp;
2470 {
2471 int error;
2472 struct vnode *rvp;
2473 char *cp;
2474
2475 /*
2476 * mp == NULL -> invalidate the current info, the FS is
2477 * no longer exported. May be called from either vfs_export
2478 * or unmount, so check if it hasn't already been done.
2479 */
2480 if (mp == NULL) {
2481 if (nfs_pub.np_valid) {
2482 nfs_pub.np_valid = 0;
2483 if (nfs_pub.np_index != NULL) {
2484 FREE(nfs_pub.np_index, M_TEMP);
2485 nfs_pub.np_index = NULL;
2486 }
2487 }
2488 return (0);
2489 }
2490
2491 /*
2492 * Only one allowed at a time.
2493 */
2494 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2495 return (EBUSY);
2496
2497 /*
2498 * Get real filehandle for root of exported FS.
2499 */
2500 memset((caddr_t)&nfs_pub.np_handle, 0, sizeof(nfs_pub.np_handle));
2501 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsidx;
2502
2503 if ((error = VFS_ROOT(mp, &rvp)))
2504 return (error);
2505
2506 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2507 return (error);
2508
2509 vput(rvp);
2510
2511 /*
2512 * If an indexfile was specified, pull it in.
2513 */
2514 if (argp->ex_indexfile != NULL) {
2515 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2516 M_WAITOK);
2517 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2518 MAXNAMLEN, (size_t *)0);
2519 if (!error) {
2520 /*
2521 * Check for illegal filenames.
2522 */
2523 for (cp = nfs_pub.np_index; *cp; cp++) {
2524 if (*cp == '/') {
2525 error = EINVAL;
2526 break;
2527 }
2528 }
2529 }
2530 if (error) {
2531 FREE(nfs_pub.np_index, M_TEMP);
2532 return (error);
2533 }
2534 }
2535
2536 nfs_pub.np_mount = mp;
2537 nfs_pub.np_valid = 1;
2538 return (0);
2539 }
2540
2541 struct netcred *
2542 vfs_export_lookup(mp, nep, nam)
2543 struct mount *mp;
2544 struct netexport *nep;
2545 struct mbuf *nam;
2546 {
2547 struct netcred *np;
2548 struct radix_node_head *rnh;
2549 struct sockaddr *saddr;
2550
2551 np = NULL;
2552 if (mp->mnt_flag & MNT_EXPORTED) {
2553 /*
2554 * Lookup in the export list first.
2555 */
2556 if (nam != NULL) {
2557 saddr = mtod(nam, struct sockaddr *);
2558 rnh = nep->ne_rtable[saddr->sa_family];
2559 if (rnh != NULL) {
2560 np = (struct netcred *)
2561 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2562 rnh);
2563 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2564 np = NULL;
2565 }
2566 }
2567 /*
2568 * If no address match, use the default if it exists.
2569 */
2570 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2571 np = &nep->ne_defexported;
2572 }
2573 return (np);
2574 }
2575
2576 /*
2577 * Do the usual access checking.
2578 * file_mode, uid and gid are from the vnode in question,
2579 * while acc_mode and cred are from the VOP_ACCESS parameter list
2580 */
2581 int
2582 vaccess(type, file_mode, uid, gid, acc_mode, cred)
2583 enum vtype type;
2584 mode_t file_mode;
2585 uid_t uid;
2586 gid_t gid;
2587 mode_t acc_mode;
2588 struct ucred *cred;
2589 {
2590 mode_t mask;
2591
2592 /*
2593 * Super-user always gets read/write access, but execute access depends
2594 * on at least one execute bit being set.
2595 */
2596 if (cred->cr_uid == 0) {
2597 if ((acc_mode & VEXEC) && type != VDIR &&
2598 (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
2599 return (EACCES);
2600 return (0);
2601 }
2602
2603 mask = 0;
2604
2605 /* Otherwise, check the owner. */
2606 if (cred->cr_uid == uid) {
2607 if (acc_mode & VEXEC)
2608 mask |= S_IXUSR;
2609 if (acc_mode & VREAD)
2610 mask |= S_IRUSR;
2611 if (acc_mode & VWRITE)
2612 mask |= S_IWUSR;
2613 return ((file_mode & mask) == mask ? 0 : EACCES);
2614 }
2615
2616 /* Otherwise, check the groups. */
2617 if (cred->cr_gid == gid || groupmember(gid, cred)) {
2618 if (acc_mode & VEXEC)
2619 mask |= S_IXGRP;
2620 if (acc_mode & VREAD)
2621 mask |= S_IRGRP;
2622 if (acc_mode & VWRITE)
2623 mask |= S_IWGRP;
2624 return ((file_mode & mask) == mask ? 0 : EACCES);
2625 }
2626
2627 /* Otherwise, check everyone else. */
2628 if (acc_mode & VEXEC)
2629 mask |= S_IXOTH;
2630 if (acc_mode & VREAD)
2631 mask |= S_IROTH;
2632 if (acc_mode & VWRITE)
2633 mask |= S_IWOTH;
2634 return ((file_mode & mask) == mask ? 0 : EACCES);
2635 }
2636
2637 /*
2638 * Unmount all file systems.
2639 * We traverse the list in reverse order under the assumption that doing so
2640 * will avoid needing to worry about dependencies.
2641 */
void
vfs_unmountall(p)
	struct proc *p;
{
	struct mount *mp, *nmp;
	int allerror, error;

	printf("unmounting file systems...");
	/*
	 * Walk the mount list from the tail so that stacked mounts go
	 * away before what they are mounted on.  Fetch the previous
	 * entry before dounmount() can free the current one.
	 */
	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
#ifdef DEBUG
		printf("\nunmounting %s (%s)...",
		    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
		/*
		 * XXX Freeze syncer. Must do this before locking the
		 * mount point. See dounmount() for details.
		 */
		lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
		if (vfs_busy(mp, 0, 0)) {
			/* Mount is busy elsewhere; skip it. */
			lockmgr(&syncer_lock, LK_RELEASE, NULL);
			continue;
		}
		if ((error = dounmount(mp, MNT_FORCE, p)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	printf(" done\n");
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}
2676
2677 extern struct simplelock bqueue_slock; /* XXX */
2678
2679 /*
2680 * Sync and unmount file systems before shutting down.
2681 */
void
vfs_shutdown()
{
	struct lwp *l = curlwp;
	struct proc *p;

	/* XXX we're certainly not running in proc0's context! */
	if (l == NULL || (p = l->l_proc) == NULL)
		p = &proc0;

	printf("syncing disks... ");

	/* remove user process from run queue */
	suspendsched();
	(void) spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	/* Schedule all dirty data for writing. */
	sys_sync(l, NULL, NULL);

	/* Wait for sync to finish. */
	if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL)
		return;

	/* Release inodes held by texts before update. */
#ifdef notdef
	vnshutdown();
#endif
	/* Unmount file systems. */
	vfs_unmountall(p);
}
2727
2728 /*
2729 * Mount the root file system. If the operator didn't specify a
2730 * file system to use, try all possible file systems until one
2731 * succeeds.
2732 */
int
vfs_mountroot()
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	/*
	 * Cross-check rootdev against the class of the boot device,
	 * and for disks obtain an open vnode for the device.
	 */
	switch (root_device->dv_class) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%08x -> %d,%d)", rootdev,
			    major(rootdev), minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED, curproc);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    root_device->dv_xname);
		return (ENODEV);
	}

	/*
	 * If user specified a file system, use it.
	 */
	if (mountroot != NULL) {
		error = (*mountroot)();
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		error = (*v->vfs_mountroot)();
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}

	/* v == NULL means the list was exhausted without success. */
	if (v == NULL) {
		printf("no file system for %s", root_device->dv_xname);
		if (root_device->dv_class == DV_DISK)
			printf(" (dev 0x%x)", rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	/* On failure, undo the VOP_OPEN of the root disk device. */
	if (error && root_device->dv_class == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED, curproc);
		vrele(rootvp);
	}
	return (error);
}
2808
2809 /*
2810 * Given a file system name, look up the vfsops for that
2811 * file system, or return NULL if file system isn't present
2812 * in the kernel.
2813 */
2814 struct vfsops *
2815 vfs_getopsbyname(name)
2816 const char *name;
2817 {
2818 struct vfsops *v;
2819
2820 LIST_FOREACH(v, &vfs_list, vfs_list) {
2821 if (strcmp(v->vfs_name, name) == 0)
2822 break;
2823 }
2824
2825 return (v);
2826 }
2827
2828 /*
2829 * Establish a file system and initialize it.
2830 */
2831 int
2832 vfs_attach(vfs)
2833 struct vfsops *vfs;
2834 {
2835 struct vfsops *v;
2836 int error = 0;
2837
2838
2839 /*
2840 * Make sure this file system doesn't already exist.
2841 */
2842 LIST_FOREACH(v, &vfs_list, vfs_list) {
2843 if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
2844 error = EEXIST;
2845 goto out;
2846 }
2847 }
2848
2849 /*
2850 * Initialize the vnode operations for this file system.
2851 */
2852 vfs_opv_init(vfs->vfs_opv_descs);
2853
2854 /*
2855 * Now initialize the file system itself.
2856 */
2857 (*vfs->vfs_init)();
2858
2859 /*
2860 * ...and link it into the kernel's list.
2861 */
2862 LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
2863
2864 /*
2865 * Sanity: make sure the reference count is 0.
2866 */
2867 vfs->vfs_refcount = 0;
2868
2869 out:
2870 return (error);
2871 }
2872
2873 /*
2874 * Remove a file system from the kernel.
2875 */
2876 int
2877 vfs_detach(vfs)
2878 struct vfsops *vfs;
2879 {
2880 struct vfsops *v;
2881
2882 /*
2883 * Make sure no one is using the filesystem.
2884 */
2885 if (vfs->vfs_refcount != 0)
2886 return (EBUSY);
2887
2888 /*
2889 * ...and remove it from the kernel's list.
2890 */
2891 LIST_FOREACH(v, &vfs_list, vfs_list) {
2892 if (v == vfs) {
2893 LIST_REMOVE(v, vfs_list);
2894 break;
2895 }
2896 }
2897
2898 if (v == NULL)
2899 return (ESRCH);
2900
2901 /*
2902 * Now run the file system-specific cleanups.
2903 */
2904 (*vfs->vfs_done)();
2905
2906 /*
2907 * Free the vnode operations vector.
2908 */
2909 vfs_opv_free(vfs->vfs_opv_descs);
2910 return (0);
2911 }
2912
2913 void
2914 vfs_reinit(void)
2915 {
2916 struct vfsops *vfs;
2917
2918 LIST_FOREACH(vfs, &vfs_list, vfs_list) {
2919 if (vfs->vfs_reinit) {
2920 (*vfs->vfs_reinit)();
2921 }
2922 }
2923 }
2924
2925 /*
2926 * Request a filesystem to suspend write operations.
2927 */
int
vfs_write_suspend(struct mount *mp, int slpflag, int slptimeo)
{
	struct proc *p = curproc;	/* XXX */
	int error;

	/*
	 * Wait for any suspension already in progress to finish.
	 * A negative slptimeo means "don't wait at all".
	 */
	while ((mp->mnt_iflag & IMNT_SUSPEND)) {
		if (slptimeo < 0)
			return EWOULDBLOCK;
		error = tsleep(&mp->mnt_flag, slpflag, "suspwt1", slptimeo);
		if (error)
			return error;
	}
	mp->mnt_iflag |= IMNT_SUSPEND;

	/* Drain the in-flight upper-level write operations. */
	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountupper > 0)
		ltsleep(&mp->mnt_writeopcountupper, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	simple_unlock(&mp->mnt_slock);

	/* Flush everything out; back the suspension out on failure. */
	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);
	if (error) {
		vfs_write_resume(mp);
		return error;
	}
	mp->mnt_iflag |= IMNT_SUSPENDLOW;

	/* Drain the lower-level writers, then mark fully suspended. */
	simple_lock(&mp->mnt_slock);
	if (mp->mnt_writeopcountlower > 0)
		ltsleep(&mp->mnt_writeopcountlower, PUSER - 1, "suspwt",
		    0, &mp->mnt_slock);
	mp->mnt_iflag |= IMNT_SUSPENDED;
	simple_unlock(&mp->mnt_slock);

	return 0;
}
2965
2966 /*
2967 * Request a filesystem to resume write operations.
2968 */
2969 void
2970 vfs_write_resume(struct mount *mp)
2971 {
2972
2973 if ((mp->mnt_iflag & IMNT_SUSPEND) == 0)
2974 return;
2975 mp->mnt_iflag &= ~(IMNT_SUSPEND | IMNT_SUSPENDLOW | IMNT_SUSPENDED);
2976 wakeup(&mp->mnt_flag);
2977 }
2978
2979 void
2980 copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
2981 {
2982 const struct statvfs *mbp;
2983
2984 if (sbp == (mbp = &mp->mnt_stat))
2985 return;
2986
2987 (void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
2988 sbp->f_fsid = mbp->f_fsid;
2989 sbp->f_owner = mbp->f_owner;
2990 sbp->f_flag = mbp->f_flag;
2991 sbp->f_syncwrites = mbp->f_syncwrites;
2992 sbp->f_asyncwrites = mbp->f_asyncwrites;
2993 sbp->f_syncreads = mbp->f_syncreads;
2994 sbp->f_asyncreads = mbp->f_asyncreads;
2995 (void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
2996 (void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
2997 sizeof(sbp->f_fstypename));
2998 (void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
2999 sizeof(sbp->f_mntonname));
3000 (void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
3001 sizeof(sbp->f_mntfromname));
3002 sbp->f_namemax = mbp->f_namemax;
3003 }
3004
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    struct mount *mp, struct proc *p)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strncpy(mp->mnt_stat.f_fstypename, mp->mnt_op->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = p->p_cwdi;

		/* Pick the copy routine matching onp's address space. */
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);

			if (!path) /* XXX can't happen with M_WAITOK */
				return ENOMEM;

			/*
			 * The process has an alternate root (chroot);
			 * prefix the mount point with the path of that
			 * root so f_mntonname is meaningful system-wide.
			 * getcwd_common() builds the path backwards from
			 * the end of the buffer.
			 */
			bp = path + MAXPATHLEN;
			*--bp = '\0';
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, p);
			if (error) {
				free(path, M_TEMP);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			free(path, M_TEMP);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				/* Append the caller-supplied path. */
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		/* Zero the tail; this also guarantees NUL termination. */
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		/* Zero the tail; this also guarantees NUL termination. */
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}
3073
3074 /*
3075 * Default vfs_extattrctl routine for file systems that do not support
3076 * it.
3077 */
/*ARGSUSED*/
int
vfs_stdextattrctl(struct mount *mp, int cmt, struct vnode *vp,
    int attrnamespace, const char *attrname, struct proc *p)
{

	/* Release the vnode lock, if any, before declining. */
	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (EOPNOTSUPP);
}
3088
3089 /*
3090 * Credential check based on process requesting service, and per-attribute
3091 * permissions.
3092 *
3093 * NOTE: Vnode must be locked.
3094 */
3095 int
3096 extattr_check_cred(struct vnode *vp, int attrnamespace,
3097 struct ucred *cred, struct proc *p, int access)
3098 {
3099
3100 if (cred == NOCRED)
3101 return (0);
3102
3103 switch (attrnamespace) {
3104 case EXTATTR_NAMESPACE_SYSTEM:
3105 /*
3106 * Do we really want to allow this, or just require that
3107 * these requests come from kernel code (NOCRED case above)?
3108 */
3109 return (suser(cred, &p->p_acflag));
3110
3111 case EXTATTR_NAMESPACE_USER:
3112 return (VOP_ACCESS(vp, access, cred, p));
3113
3114 default:
3115 return (EPERM);
3116 }
3117 }
3118
3119 #ifdef DDB
3120 const char buf_flagbits[] = BUF_FLAGBITS;
3121
/*
 * vfs_buf_print: DDB helper that dumps the interesting fields of a
 * struct buf through the supplied printf-like routine.
 */
void
vfs_buf_print(bp, full, pr)
	struct buf *bp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[1024];

	/* Identity: owning vnode, logical/physical block, device. */
	(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_dev);

	/* Decode b_flags symbolically. */
	bitmask_snprintf(bp->b_flags, buf_flagbits, buf, sizeof(buf));
	(*pr)(" error %d flags 0x%s\n", bp->b_error, buf);

	(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
	    bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)(" data %p saveaddr %p dep %p\n",
	    bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
	(*pr)(" iodone %p\n", bp->b_iodone);
}
3142
3143
/*
 * vfs_vnode_print: DDB helper that dumps a struct vnode; with "full"
 * set it also dumps every buffer on the vnode's clean and dirty lists.
 */
void
vfs_vnode_print(vp, full, pr)
	struct vnode *vp;
	int full;
	void (*pr)(const char *, ...);
{
	char buf[256];

	/* Print the embedded uvm_object first, then vnode state. */
	uvm_object_printit(&vp->v_uobj, full, pr);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	(*pr)("\nVNODE flags %s\n", buf);
	(*pr)("mp %p numoutput %d size 0x%llx\n",
	    vp->v_mount, vp->v_numoutput, vp->v_size);

	(*pr)("data %p usecount %d writecount %ld holdcnt %ld numoutput %d\n",
	    vp->v_data, vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt, vp->v_numoutput);

	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
	    ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_mount, vp->v_mountedhere);

	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}
3183
3184 void
3185 vfs_mount_print(mp, full, pr)
3186 struct mount *mp;
3187 int full;
3188 void (*pr)(const char *, ...);
3189 {
3190 char sbuf[256];
3191
3192 (*pr)("vnodecovered = %p syncer = %p data = %p\n",
3193 mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data);
3194
3195 (*pr)("fs_bshift %d dev_bshift = %d\n",
3196 mp->mnt_fs_bshift,mp->mnt_dev_bshift);
3197
3198 bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf));
3199 (*pr)("flag = %s\n", sbuf);
3200
3201 bitmask_snprintf(mp->mnt_iflag, __IMNT_FLAG_BITS, sbuf, sizeof(sbuf));
3202 (*pr)("iflag = %s\n", sbuf);
3203
3204 /* XXX use lockmgr_printinfo */
3205 if (mp->mnt_lock.lk_sharecount)
3206 (*pr)(" lock type %s: SHARED (count %d)", mp->mnt_lock.lk_wmesg,
3207 mp->mnt_lock.lk_sharecount);
3208 else if (mp->mnt_lock.lk_flags & LK_HAVE_EXCL) {
3209 (*pr)(" lock type %s: EXCL (count %d) by ",
3210 mp->mnt_lock.lk_wmesg, mp->mnt_lock.lk_exclusivecount);
3211 if (mp->mnt_lock.lk_flags & LK_SPIN)
3212 (*pr)("processor %lu", mp->mnt_lock.lk_cpu);
3213 else
3214 (*pr)("pid %d.%d", mp->mnt_lock.lk_lockholder,
3215 mp->mnt_lock.lk_locklwp);
3216 } else
3217 (*pr)(" not locked");
3218 if ((mp->mnt_lock.lk_flags & LK_SPIN) == 0 && mp->mnt_lock.lk_waitcount > 0)
3219 (*pr)(" with %d pending", mp->mnt_lock.lk_waitcount);
3220
3221 (*pr)("\n");
3222
3223 if (mp->mnt_unmounter) {
3224 (*pr)("unmounter pid = %d ",mp->mnt_unmounter->p_pid);
3225 }
3226 (*pr)("wcnt = %d, writeopcountupper = %d, writeopcountupper = %d\n",
3227 mp->mnt_wcnt,mp->mnt_writeopcountupper,mp->mnt_writeopcountlower);
3228
3229 (*pr)("statvfs cache:\n");
3230 (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize);
3231 (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize);
3232 (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize);
3233
3234 (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks);
3235 (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree);
3236 (*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail);
3237 (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd);
3238
3239 (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files);
3240 (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree);
3241 (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail);
3242 (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd);
3243
3244 (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
3245 mp->mnt_stat.f_fsidx.__fsid_val[0],
3246 mp->mnt_stat.f_fsidx.__fsid_val[1]);
3247
3248 (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner);
3249 (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax);
3250
3251 bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf,
3252 sizeof(sbuf));
3253 (*pr)("\tflag = %s\n",sbuf);
3254 (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites);
3255 (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites);
3256 (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads);
3257 (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads);
3258 (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename);
3259 (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname);
3260 (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname);
3261
3262 {
3263 int cnt = 0;
3264 struct vnode *vp;
3265 (*pr)("locked vnodes =");
3266 /* XXX would take mountlist lock, except ddb may not have context */
3267 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3268 if (VOP_ISLOCKED(vp)) {
3269 if ((++cnt % 6) == 0) {
3270 (*pr)(" %p,\n\t", vp);
3271 } else {
3272 (*pr)(" %p,", vp);
3273 }
3274 }
3275 }
3276 (*pr)("\n");
3277 }
3278
3279 if (full) {
3280 int cnt = 0;
3281 struct vnode *vp;
3282 (*pr)("all vnodes =");
3283 /* XXX would take mountlist lock, except ddb may not have context */
3284 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
3285 if (!LIST_NEXT(vp, v_mntvnodes)) {
3286 (*pr)(" %p", vp);
3287 } else if ((++cnt % 6) == 0) {
3288 (*pr)(" %p,\n\t", vp);
3289 } else {
3290 (*pr)(" %p,", vp);
3291 }
3292 }
3293 (*pr)("\n", vp);
3294 }
3295 }
3296
3297 #endif
Cache object: 18202f87804353b67c1d2d03c458e7e9
|