1 /* $NetBSD: lfs_syscalls.c,v 1.100 2003/12/04 14:57:47 yamt Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant@hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38 /*-
39 * Copyright (c) 1991, 1993, 1994
40 * The Regents of the University of California. All rights reserved.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.100 2003/12/04 14:57:47 yamt Exp $");
71
72 #ifndef LFS
73 # define LFS /* for prototypes in syscallargs.h */
74 #endif
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/proc.h>
79 #include <sys/buf.h>
80 #include <sys/mount.h>
81 #include <sys/vnode.h>
82 #include <sys/malloc.h>
83 #include <sys/kernel.h>
84
85 #include <sys/sa.h>
86 #include <sys/syscallargs.h>
87
88 #include <ufs/ufs/inode.h>
89 #include <ufs/ufs/ufsmount.h>
90 #include <ufs/ufs/ufs_extern.h>
91
92 #include <ufs/lfs/lfs.h>
93 #include <ufs/lfs/lfs_extern.h>
94
95 struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
96 int lfs_fasthashget(dev_t, ino_t, struct vnode **);
97
98 int debug_cleaner = 0;
99 int clean_vnlocked = 0;
100 int clean_inlocked = 0;
101 int verbose_debug = 0;
102
103 pid_t lfs_cleaner_pid = 0;
104
105 #define LFS_FORCE_WRITE UNASSIGNED
106
107 /*
108 * sys_lfs_markv:
109 *
110 * This will mark inodes and blocks dirty, so they are written into the log.
111 * It will block until all the blocks have been written. The segment create
112 * time passed in the block_info and inode_info structures is used to decide
113 * if the data is valid for each block (in case some process dirtied a block
114 * or inode that is being cleaned between the determination that a block is
115 * live and the lfs_markv call).
116 *
117 * 0 on success
118 * -1/errno is return on error.
119 */
120 #ifdef USE_64BIT_SYSCALLS
121 int
122 sys_lfs_markv(struct proc *p, void *v, register_t *retval)
123 {
124 struct sys_lfs_markv_args /* {
125 syscallarg(fsid_t *) fsidp;
126 syscallarg(struct block_info *) blkiov;
127 syscallarg(int) blkcnt;
128 } */ *uap = v;
129 BLOCK_INFO *blkiov;
130 int blkcnt, error;
131 fsid_t fsid;
132
133 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
134 return (error);
135
136 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
137 return (error);
138
139 blkcnt = SCARG(uap, blkcnt);
140 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
141 return (EINVAL);
142
143 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
144 if ((error = copyin(SCARG(uap, blkiov), blkiov,
145 blkcnt * sizeof(BLOCK_INFO))) != 0)
146 goto out;
147
148 if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
149 copyout(blkiov, SCARG(uap, blkiov),
150 blkcnt * sizeof(BLOCK_INFO));
151 out:
152 free(blkiov, M_SEGMENT);
153 return error;
154 }
155 #else
156 int
157 sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
158 {
159 struct sys_lfs_markv_args /* {
160 syscallarg(fsid_t *) fsidp;
161 syscallarg(struct block_info *) blkiov;
162 syscallarg(int) blkcnt;
163 } */ *uap = v;
164 BLOCK_INFO *blkiov;
165 BLOCK_INFO_15 *blkiov15;
166 int i, blkcnt, error;
167 fsid_t fsid;
168
169 if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
170 return (error);
171
172 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
173 return (error);
174
175 blkcnt = SCARG(uap, blkcnt);
176 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
177 return (EINVAL);
178
179 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
180 blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
181 if ((error = copyin(SCARG(uap, blkiov), blkiov15,
182 blkcnt * sizeof(BLOCK_INFO_15))) != 0)
183 goto out;
184
185 for (i = 0; i < blkcnt; i++) {
186 blkiov[i].bi_inode = blkiov15[i].bi_inode;
187 blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
188 blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
189 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
190 blkiov[i].bi_version = blkiov15[i].bi_version;
191 blkiov[i].bi_bp = blkiov15[i].bi_bp;
192 blkiov[i].bi_size = blkiov15[i].bi_size;
193 }
194
195 if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
196 for (i = 0; i < blkcnt; i++) {
197 blkiov15[i].bi_inode = blkiov[i].bi_inode;
198 blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
199 blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
200 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
201 blkiov15[i].bi_version = blkiov[i].bi_version;
202 blkiov15[i].bi_bp = blkiov[i].bi_bp;
203 blkiov15[i].bi_size = blkiov[i].bi_size;
204 }
205 copyout(blkiov15, SCARG(uap, blkiov),
206 blkcnt * sizeof(BLOCK_INFO_15));
207 }
208 out:
209 free(blkiov, M_SEGMENT);
210 free(blkiov15, M_SEGMENT);
211 return error;
212 }
213 #endif
214
215 #define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS)
216
/*
 * Do the work of lfs_markv: take the seglock, walk the BLOCK_INFO array,
 * and mark the described blocks/inodes dirty so the segment writer will
 * rewrite them into the log.  Periodically flushes with lfs_segwrite()
 * so no more than LFS_MARKV_MAX_BLOCKS accumulate, and finishes with a
 * synchronous checkpoint write.
 *
 * Returns 0 on success, EAGAIN if some blocks could not be handled and
 * the cleaner should retry, or an errno on hard failure.
 */
int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
#ifdef DEBUG_LFS
	int vputc = 0, iwritten = 0;
#endif
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;	/* nonzero => tell the cleaner to retry (EAGAIN) */
	int numrefed = 0;	/* lfs_fastvget refs held; must be 0 on return */
	ino_t maxino;
	size_t obsize;		/* expected on-disk size of the current block */

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	/*
	 * Highest inode number the ifile can currently describe, derived
	 * from the ifile vnode's block count less the cleanerinfo and
	 * segment-table blocks, times inodes-per-ifile-block.
	 */
	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * This seglock is just to prevent the fact that we might have to sleep
	 * from allowing the possibility that our blocks might become
	 * invalid.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty. */
	error = 0;

#ifdef DEBUG_LFS
	/* Run through and count the inodes */
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp) {
		if (lastino != blkp->bi_inode) {
			lastino = blkp->bi_inode;
			vputc++;
		}
	}
	cnt = blkcnt;
	printf("[%d/",vputc);
	iwritten = 0;
#endif /* DEBUG_LFS */
	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		if (blkp->bi_daddr == LFS_FORCE_WRITE)
			printf("lfs_markv: warning: force-writing ino %d "
			       "lbn %lld\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn);
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
					iwritten++;
#endif
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			/* Inode is unallocated and no force requested: skip. */
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
#ifdef DEBUG_LFS
				printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
				       error, blkp->bi_inode,
				       dtosn(fs, blkp->bi_daddr));
#endif /* DEBUG_LFS */
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_ADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * inlocked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
					LFS_SET_UINO(ip, IN_CLEANING);
				}
				brelse(bp);
			}
			continue;
		}

		/*
		 * Verify (unless forced) that the block still lives where
		 * the cleaner thinks it does; if not, someone rewrote it
		 * since the cleaner looked, so skip it and ask for a retry.
		 */
		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
			{
				if (dtosn(fs,dbtofsb(fs, b_daddr))
				    == dtosn(fs,blkp->bi_daddr))
				{
					printf("lfs_markv: wrong da same seg: %llx vs %llx\n",
					       (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr));
				}
				do_again++;
				continue;
			}
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			printf("lfs_markv: ino %d lbn %lld wrong size (%ld != %d), try again\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn,
			       (long) obsize, blkp->bi_size);
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
#ifdef DEBUG_LFS
			printf("lfs_markv: writing %d blks %d inos\n",
			       nblkwritten, ninowritten);
#endif
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
			iwritten++;
#endif
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	printf("%d]",iwritten);
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

#ifdef DEBUG_LFS
	printf("lfs_markv: writing %d blks %d inos (check point)\n",
	       nblkwritten, ninowritten);
#endif
	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	printf("lfs_markv err2\n");

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	/* Drop the vnode reference still held for the current file. */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

	return (error);
}
577
578 /*
579 * sys_lfs_bmapv:
580 *
581 * This will fill in the current disk address for arrays of blocks.
582 *
583 * 0 on success
584 * -1/errno is return on error.
585 */
586 #ifdef USE_64BIT_SYSCALLS
587 int
588 sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
589 {
590 struct sys_lfs_bmapv_args /* {
591 syscallarg(fsid_t *) fsidp;
592 syscallarg(struct block_info *) blkiov;
593 syscallarg(int) blkcnt;
594 } */ *uap = v;
595 BLOCK_INFO *blkiov;
596 int blkcnt, error;
597 fsid_t fsid;
598
599 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
600 return (error);
601
602 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
603 return (error);
604
605 blkcnt = SCARG(uap, blkcnt);
606 if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
607 return (EINVAL);
608 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
609 if ((error = copyin(SCARG(uap, blkiov), blkiov,
610 blkcnt * sizeof(BLOCK_INFO))) != 0)
611 goto out;
612
613 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
614 copyout(blkiov, SCARG(uap, blkiov),
615 blkcnt * sizeof(BLOCK_INFO));
616 out:
617 free(blkiov, M_SEGMENT);
618 return error;
619 }
620 #else
621 int
622 sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
623 {
624 struct sys_lfs_bmapv_args /* {
625 syscallarg(fsid_t *) fsidp;
626 syscallarg(struct block_info *) blkiov;
627 syscallarg(int) blkcnt;
628 } */ *uap = v;
629 struct proc *p = l->l_proc;
630 BLOCK_INFO *blkiov;
631 BLOCK_INFO_15 *blkiov15;
632 int i, blkcnt, error;
633 fsid_t fsid;
634
635 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
636 return (error);
637
638 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
639 return (error);
640
641 blkcnt = SCARG(uap, blkcnt);
642 if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
643 return (EINVAL);
644 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
645 blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
646 if ((error = copyin(SCARG(uap, blkiov), blkiov15,
647 blkcnt * sizeof(BLOCK_INFO_15))) != 0)
648 goto out;
649
650 for (i = 0; i < blkcnt; i++) {
651 blkiov[i].bi_inode = blkiov15[i].bi_inode;
652 blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
653 blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
654 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
655 blkiov[i].bi_version = blkiov15[i].bi_version;
656 blkiov[i].bi_bp = blkiov15[i].bi_bp;
657 blkiov[i].bi_size = blkiov15[i].bi_size;
658 }
659
660 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
661 for (i = 0; i < blkcnt; i++) {
662 blkiov15[i].bi_inode = blkiov[i].bi_inode;
663 blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
664 blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
665 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
666 blkiov15[i].bi_version = blkiov[i].bi_version;
667 blkiov15[i].bi_bp = blkiov[i].bi_bp;
668 blkiov15[i].bi_size = blkiov[i].bi_size;
669 }
670 copyout(blkiov15, SCARG(uap, blkiov),
671 blkcnt * sizeof(BLOCK_INFO_15));
672 }
673 out:
674 free(blkiov, M_SEGMENT);
675 free(blkiov15, M_SEGMENT);
676 return error;
677 }
678 #endif
679
/*
 * Do the work of lfs_bmapv: for each BLOCK_INFO, fill in bi_daddr with
 * the block's current disk address (LFS_UNUSED_DADDR if the file or
 * block no longer exists) and bi_size with its current size, so the
 * cleaner can tell which blocks in a segment are still live.
 * Always returns 0 once the filesystem has been found and busied.
 */
int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;	/* lfs_vref/VFS_VGET refs held; 0 on return */

	/* Remember who the cleaner is (used elsewhere in LFS). */
	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* File has been deleted: report the block unused. */
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
#ifdef DEBUG_LFS
					printf("lfs_bmapv: vget of ino %d failed with %d",blkp->bi_inode,error);
#endif
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_bmapv: numrefed=%d", numrefed);
	}
#endif

	vfs_unbusy(mntp);

	return 0;
}
835
836 /*
837 * sys_lfs_segclean:
838 *
839 * Mark the segment clean.
840 *
841 * 0 on success
842 * -1/errno is return on error.
843 */
844 int
845 sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
846 {
847 struct sys_lfs_segclean_args /* {
848 syscallarg(fsid_t *) fsidp;
849 syscallarg(u_long) segment;
850 } */ *uap = v;
851 struct lfs *fs;
852 struct mount *mntp;
853 fsid_t fsid;
854 int error;
855 unsigned long segnum;
856 struct proc *p = l->l_proc;
857
858 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
859 return (error);
860
861 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
862 return (error);
863 if ((mntp = vfs_getvfs(&fsid)) == NULL)
864 return (ENOENT);
865
866 fs = VFSTOUFS(mntp)->um_lfs;
867 segnum = SCARG(uap, segment);
868
869 if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
870 return (error);
871
872 lfs_seglock(fs, SEGM_PROT);
873 error = lfs_do_segclean(fs, segnum);
874 lfs_segunlock(fs);
875 vfs_unbusy(mntp);
876 return error;
877 }
878
879 /*
880 * Actually mark the segment clean.
881 * Must be called with the segment lock held.
882 */
/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 *
 * Refuses (EBUSY) if the segment is the current write segment, still has
 * live bytes, or is marked active; returns EALREADY if it is already
 * clean.  Otherwise credits the segment's space back to the free-space
 * accounting, clears SEGUSE_DIRTY, and updates the cleanerinfo block.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	/* Never clean the segment currently being written. */
	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
		       segnum, sup->su_nbytes);
		brelse(bp);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		brelse(bp);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		brelse(bp);
		return (EALREADY);
	}

	/* Credit the whole segment back, less any superblock/label space. */
	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	/* Summary and inode blocks in the segment become free space again. */
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	/* Keep the cleanerinfo block's counters in sync. */
	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	cip->avail = fs->lfs_avail - fs->lfs_ravail;
	(void) LFS_BWRITE_LOG(bp);
	/* Wake anyone sleeping for free space. */
	wakeup(&fs->lfs_avail);

	return (0);
}
936
937 /*
938 * This will block until a segment in file system fsid is written. A timeout
939 * in milliseconds may be specified which will awake the cleaner automatically.
940 * An fsid of -1 means any file system, and a timeout of 0 means forever.
941 */
/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified which will awake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 *
 * Note: *tv is modified in place (converted to an absolute time below).
 * Returns EINTR if the sleep was interrupted, 0 otherwise (including on
 * timeout).
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error, s;

	/* Unknown fsid => wait on the global "all clean" channel. */
	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	/* Convert the relative timeout to absolute time, then to ticks. */
	s = splclock();
	timeradd(tv, &time, tv);
	timeout = hzto(tv);
	splx(s);
	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}
965
966 /*
967 * sys_lfs_segwait:
968 *
969 * System call wrapper around lfs_segwait().
970 *
971 * 0 on success
972 * 1 on timeout
973 * -1/errno is return on error.
974 */
975 int
976 sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
977 {
978 struct sys_lfs_segwait_args /* {
979 syscallarg(fsid_t *) fsidp;
980 syscallarg(struct timeval *) tv;
981 } */ *uap = v;
982 struct proc *p = l->l_proc;
983 struct timeval atv;
984 fsid_t fsid;
985 int error;
986
987 /* XXX need we be su to segwait? */
988 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
989 return (error);
990 }
991 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
992 return (error);
993
994 if (SCARG(uap, tv)) {
995 error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
996 if (error)
997 return (error);
998 if (itimerfix(&atv))
999 return (EINVAL);
1000 } else /* NULL or invalid */
1001 atv.tv_sec = atv.tv_usec = 0;
1002 return lfs_segwait(&fsid, &atv);
1003 }
1004
1005 /*
1006 * VFS_VGET call specialized for the cleaner. The cleaner already knows the
1007 * daddr from the ifile, so don't look it up again. If the cleaner is
1008 * processing IINFO structures, it may have the ondisk inode already, so
1009 * don't go retrieving it again.
1010 *
1011 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
1012 * when finished.
1013 */
1014 extern struct lock ufs_hashlock;
1015
1016 int
1017 lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
1018 {
1019
1020 /*
1021 * This is playing fast and loose. Someone may have the inode
1022 * locked, in which case they are going to be distinctly unhappy
1023 * if we trash something.
1024 */
1025 if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
1026 if ((*vpp)->v_flag & VXLOCK) {
1027 printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
1028 ino);
1029 clean_vnlocked++;
1030 #ifdef LFS_EAGAIN_FAIL
1031 return EAGAIN;
1032 #endif
1033 }
1034 if (lfs_vref(*vpp)) {
1035 clean_inlocked++;
1036 return EAGAIN;
1037 }
1038 } else
1039 *vpp = NULL;
1040
1041 return (0);
1042 }
1043
1044 int
1045 lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
1046 {
1047 struct inode *ip;
1048 struct ufs1_dinode *dip;
1049 struct vnode *vp;
1050 struct ufsmount *ump;
1051 dev_t dev;
1052 int error, retries;
1053 struct buf *bp;
1054 struct lfs *fs;
1055
1056 ump = VFSTOUFS(mp);
1057 dev = ump->um_dev;
1058 fs = ump->um_lfs;
1059
1060 /*
1061 * Wait until the filesystem is fully mounted before allowing vget
1062 * to complete. This prevents possible problems with roll-forward.
1063 */
1064 while (fs->lfs_flags & LFS_NOTYET) {
1065 tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
1066 }
1067 /*
1068 * This is playing fast and loose. Someone may have the inode
1069 * locked, in which case they are going to be distinctly unhappy
1070 * if we trash something.
1071 */
1072
1073 error = lfs_fasthashget(dev, ino, vpp);
1074 if (error != 0 || *vpp != NULL)
1075 return (error);
1076
1077 if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
1078 *vpp = NULL;
1079 return (error);
1080 }
1081
1082 do {
1083 error = lfs_fasthashget(dev, ino, vpp);
1084 if (error != 0 || *vpp != NULL) {
1085 ungetnewvnode(vp);
1086 return (error);
1087 }
1088 } while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));
1089
1090 /* Allocate new vnode/inode. */
1091 lfs_vcreate(mp, ino, vp);
1092
1093 /*
1094 * Put it onto its hash chain and lock it so that other requests for
1095 * this inode will block if they arrive while we are sleeping waiting
1096 * for old data structures to be purged or for the contents of the
1097 * disk portion of this inode to be read.
1098 */
1099 ip = VTOI(vp);
1100 ufs_ihashins(ip);
1101 lockmgr(&ufs_hashlock, LK_RELEASE, 0);
1102
1103 /*
1104 * XXX
1105 * This may not need to be here, logically it should go down with
1106 * the i_devvp initialization.
1107 * Ask Kirk.
1108 */
1109 ip->i_lfs = fs;
1110
1111 /* Read in the disk contents for the inode, copy into the inode. */
1112 if (dinp) {
1113 error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
1114 if (error) {
1115 printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
1116 ufs_ihashrem(ip);
1117
1118 /* Unlock and discard unneeded inode. */
1119 lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
1120 lfs_vunref(vp);
1121 *vpp = NULL;
1122 return (error);
1123 }
1124 if (ip->i_number != ino)
1125 panic("lfs_fastvget: I was fed the wrong inode!");
1126 } else {
1127 retries = 0;
1128 again:
1129 error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
1130 NOCRED, &bp);
1131 if (error) {
1132 printf("lfs_fastvget: bread failed with %d\n",error);
1133 /*
1134 * The inode does not contain anything useful, so it
1135 * would be misleading to leave it on its hash chain.
1136 * Iput() will return it to the free list.
1137 */
1138 ufs_ihashrem(ip);
1139
1140 /* Unlock and discard unneeded inode. */
1141 lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
1142 lfs_vunref(vp);
1143 brelse(bp);
1144 *vpp = NULL;
1145 return (error);
1146 }
1147 dip = lfs_ifind(ump->um_lfs, ino, bp);
1148 if (dip == NULL) {
1149 /* Assume write has not completed yet; try again */
1150 bp->b_flags |= B_INVAL;
1151 brelse(bp);
1152 ++retries;
1153 if (retries > LFS_IFIND_RETRIES)
1154 panic("lfs_fastvget: dinode not found");
1155 printf("lfs_fastvget: dinode not found, retrying...\n");
1156 goto again;
1157 }
1158 *ip->i_din.ffs1_din = *dip;
1159 brelse(bp);
1160 }
1161 lfs_vinit(mp, &vp);
1162
1163 *vpp = vp;
1164
1165 KASSERT(VOP_ISLOCKED(vp));
1166 VOP_UNLOCK(vp, 0);
1167
1168 return (0);
1169 }
1170
1171 /*
1172 * Make up a "fake" cleaner buffer, copy the data from userland into it.
1173 */
1174 struct buf *
1175 lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
1176 {
1177 struct buf *bp;
1178 int error;
1179
1180 KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);
1181
1182 bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
1183 error = copyin(uaddr, bp->b_data, size);
1184 if (error) {
1185 lfs_freebuf(fs, bp);
1186 return NULL;
1187 }
1188 KDASSERT(bp->b_iodone == lfs_callback);
1189
1190 #if 0
1191 ++fs->lfs_iocount;
1192 #endif
1193 bp->b_bufsize = size;
1194 bp->b_bcount = size;
1195 return (bp);
1196 }
Cache object: 86ba92aa777253fcda560b42f0418391
|