1 /*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
14 * 1614 Oxford Street mckusick@mckusick.com
15 * Berkeley, CA 94709-1608 +1-510-843-9542
16 * USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
40 */
41
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44
45 #include "opt_ffs.h"
46 #include "opt_quota.h"
47 #include "opt_ddb.h"
48
49 /*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52 #ifndef DEBUG
53 #define DEBUG
54 #endif
55
56 #include <sys/param.h>
57 #include <sys/kernel.h>
58 #include <sys/systm.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/kdb.h>
62 #include <sys/kthread.h>
63 #include <sys/ktr.h>
64 #include <sys/limits.h>
65 #include <sys/lock.h>
66 #include <sys/malloc.h>
67 #include <sys/mount.h>
68 #include <sys/mutex.h>
69 #include <sys/namei.h>
70 #include <sys/priv.h>
71 #include <sys/proc.h>
72 #include <sys/racct.h>
73 #include <sys/rwlock.h>
74 #include <sys/stat.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/vnode.h>
78 #include <sys/conf.h>
79
80 #include <ufs/ufs/dir.h>
81 #include <ufs/ufs/extattr.h>
82 #include <ufs/ufs/quota.h>
83 #include <ufs/ufs/inode.h>
84 #include <ufs/ufs/ufsmount.h>
85 #include <ufs/ffs/fs.h>
86 #include <ufs/ffs/softdep.h>
87 #include <ufs/ffs/ffs_extern.h>
88 #include <ufs/ufs/ufs_extern.h>
89
90 #include <vm/vm.h>
91 #include <vm/vm_extern.h>
92 #include <vm/vm_object.h>
93
94 #include <geom/geom.h>
95
96 #include <ddb/ddb.h>
97
98 #define KTR_SUJ 0 /* Define to KTR_SPARE. */
99
100 #ifndef SOFTUPDATES
101
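/*
 * Stub entry points used when FFS is built without "options SOFTUPDATES".
 * They allow the rest of the kernel to link; routines that should never be
 * reached in that configuration panic, while the remainder return benign
 * defaults or do the minimal work required (e.g. softdep_check_suspend()).
 */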
102 int
103 softdep_flushfiles(oldmnt, flags, td)
104 struct mount *oldmnt;
105 int flags;
106 struct thread *td;
107 {
108
109 panic("softdep_flushfiles called");
110 }
111
112 int
113 softdep_mount(devvp, mp, fs, cred)
114 struct vnode *devvp;
115 struct mount *mp;
116 struct fs *fs;
117 struct ucred *cred;
118 {
119
120 return (0);
121 }
122
123 void
124 softdep_initialize()
125 {
126
127 return;
128 }
129
130 void
131 softdep_uninitialize()
132 {
133
134 return;
135 }
136
137 void
138 softdep_unmount(mp)
139 struct mount *mp;
140 {
141
142 panic("softdep_unmount called");
143 }
144
145 void
146 softdep_setup_sbupdate(ump, fs, bp)
147 struct ufsmount *ump;
148 struct fs *fs;
149 struct buf *bp;
150 {
151
152 panic("softdep_setup_sbupdate called");
153 }
154
155 void
156 softdep_setup_inomapdep(bp, ip, newinum, mode)
157 struct buf *bp;
158 struct inode *ip;
159 ino_t newinum;
160 int mode;
161 {
162
163 panic("softdep_setup_inomapdep called");
164 }
165
166 void
167 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
168 struct buf *bp;
169 struct mount *mp;
170 ufs2_daddr_t newblkno;
171 int frags;
172 int oldfrags;
173 {
174
175 panic("softdep_setup_blkmapdep called");
176 }
177
178 void
179 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
180 struct inode *ip;
181 ufs_lbn_t lbn;
182 ufs2_daddr_t newblkno;
183 ufs2_daddr_t oldblkno;
184 long newsize;
185 long oldsize;
186 struct buf *bp;
187 {
188
189 panic("softdep_setup_allocdirect called");
190 }
191
192 void
193 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
194 struct inode *ip;
195 ufs_lbn_t lbn;
196 ufs2_daddr_t newblkno;
197 ufs2_daddr_t oldblkno;
198 long newsize;
199 long oldsize;
200 struct buf *bp;
201 {
202
203 panic("softdep_setup_allocext called");
204 }
205
206 void
207 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
208 struct inode *ip;
209 ufs_lbn_t lbn;
210 struct buf *bp;
211 int ptrno;
212 ufs2_daddr_t newblkno;
213 ufs2_daddr_t oldblkno;
214 struct buf *nbp;
215 {
216
217 panic("softdep_setup_allocindir_page called");
218 }
219
220 void
221 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
222 struct buf *nbp;
223 struct inode *ip;
224 struct buf *bp;
225 int ptrno;
226 ufs2_daddr_t newblkno;
227 {
228
229 panic("softdep_setup_allocindir_meta called");
230 }
231
232 void
233 softdep_journal_freeblocks(ip, cred, length, flags)
234 struct inode *ip;
235 struct ucred *cred;
236 off_t length;
237 int flags;
238 {
239
240 panic("softdep_journal_freeblocks called");
241 }
242
243 void
244 softdep_journal_fsync(ip)
245 struct inode *ip;
246 {
247
248 panic("softdep_journal_fsync called");
249 }
250
251 void
252 softdep_setup_freeblocks(ip, length, flags)
253 struct inode *ip;
254 off_t length;
255 int flags;
256 {
257
258 panic("softdep_setup_freeblocks called");
259 }
260
261 void
262 softdep_freefile(pvp, ino, mode)
263 struct vnode *pvp;
264 ino_t ino;
265 int mode;
266 {
267
268 panic("softdep_freefile called");
269 }
270
271 int
272 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
273 struct buf *bp;
274 struct inode *dp;
275 off_t diroffset;
276 ino_t newinum;
277 struct buf *newdirbp;
278 int isnewblk;
279 {
280
281 panic("softdep_setup_directory_add called");
282 }
283
284 void
285 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
286 struct buf *bp;
287 struct inode *dp;
288 caddr_t base;
289 caddr_t oldloc;
290 caddr_t newloc;
291 int entrysize;
292 {
293
294 panic("softdep_change_directoryentry_offset called");
295 }
296
297 void
298 softdep_setup_remove(bp, dp, ip, isrmdir)
299 struct buf *bp;
300 struct inode *dp;
301 struct inode *ip;
302 int isrmdir;
303 {
304
305 panic("softdep_setup_remove called");
306 }
307
308 void
309 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
310 struct buf *bp;
311 struct inode *dp;
312 struct inode *ip;
313 ino_t newinum;
314 int isrmdir;
315 {
316
317 panic("softdep_setup_directory_change called");
318 }
319
320 void
321 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
322 struct mount *mp;
323 struct buf *bp;
324 ufs2_daddr_t blkno;
325 int frags;
326 struct workhead *wkhd;
327 {
328
329 panic("%s called", __FUNCTION__);
330 }
331
332 void
333 softdep_setup_inofree(mp, bp, ino, wkhd)
334 struct mount *mp;
335 struct buf *bp;
336 ino_t ino;
337 struct workhead *wkhd;
338 {
339
340 panic("%s called", __FUNCTION__);
341 }
342
343 void
344 softdep_setup_unlink(dp, ip)
345 struct inode *dp;
346 struct inode *ip;
347 {
348
349 panic("%s called", __FUNCTION__);
350 }
351
352 void
353 softdep_setup_link(dp, ip)
354 struct inode *dp;
355 struct inode *ip;
356 {
357
358 panic("%s called", __FUNCTION__);
359 }
360
361 void
362 softdep_revert_link(dp, ip)
363 struct inode *dp;
364 struct inode *ip;
365 {
366
367 panic("%s called", __FUNCTION__);
368 }
369
370 void
371 softdep_setup_rmdir(dp, ip)
372 struct inode *dp;
373 struct inode *ip;
374 {
375
376 panic("%s called", __FUNCTION__);
377 }
378
379 void
380 softdep_revert_rmdir(dp, ip)
381 struct inode *dp;
382 struct inode *ip;
383 {
384
385 panic("%s called", __FUNCTION__);
386 }
387
388 void
389 softdep_setup_create(dp, ip)
390 struct inode *dp;
391 struct inode *ip;
392 {
393
394 panic("%s called", __FUNCTION__);
395 }
396
397 void
398 softdep_revert_create(dp, ip)
399 struct inode *dp;
400 struct inode *ip;
401 {
402
403 panic("%s called", __FUNCTION__);
404 }
405
406 void
407 softdep_setup_mkdir(dp, ip)
408 struct inode *dp;
409 struct inode *ip;
410 {
411
412 panic("%s called", __FUNCTION__);
413 }
414
415 void
416 softdep_revert_mkdir(dp, ip)
417 struct inode *dp;
418 struct inode *ip;
419 {
420
421 panic("%s called", __FUNCTION__);
422 }
423
424 void
425 softdep_setup_dotdot_link(dp, ip)
426 struct inode *dp;
427 struct inode *ip;
428 {
429
430 panic("%s called", __FUNCTION__);
431 }
432
433 int
434 softdep_prealloc(vp, waitok)
435 struct vnode *vp;
436 int waitok;
437 {
438
439 panic("%s called", __FUNCTION__);
440 }
441
442 int
443 softdep_journal_lookup(mp, vpp)
444 struct mount *mp;
445 struct vnode **vpp;
446 {
447
448 return (ENOENT);
449 }
450
451 void
452 softdep_change_linkcnt(ip)
453 struct inode *ip;
454 {
455
456 panic("softdep_change_linkcnt called");
457 }
458
459 void
460 softdep_load_inodeblock(ip)
461 struct inode *ip;
462 {
463
464 panic("softdep_load_inodeblock called");
465 }
466
467 void
468 softdep_update_inodeblock(ip, bp, waitfor)
469 struct inode *ip;
470 struct buf *bp;
471 int waitfor;
472 {
473
474 panic("softdep_update_inodeblock called");
475 }
476
477 int
478 softdep_fsync(vp)
479 struct vnode *vp; /* the "in_core" copy of the inode */
480 {
481
482 return (0);
483 }
484
485 void
486 softdep_fsync_mountdev(vp)
487 struct vnode *vp;
488 {
489
490 return;
491 }
492
493 int
494 softdep_flushworklist(oldmnt, countp, td)
495 struct mount *oldmnt;
496 int *countp;
497 struct thread *td;
498 {
499
500 *countp = 0;
501 return (0);
502 }
503
504 int
505 softdep_sync_metadata(struct vnode *vp)
506 {
507
508 panic("softdep_sync_metadata called");
509 }
510
511 int
512 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
513 {
514
515 panic("softdep_sync_buf called");
516 }
517
518 int
519 softdep_slowdown(vp)
520 struct vnode *vp;
521 {
522
523 panic("softdep_slowdown called");
524 }
525
526 int
527 softdep_request_cleanup(fs, vp, cred, resource)
528 struct fs *fs;
529 struct vnode *vp;
530 struct ucred *cred;
531 int resource;
532 {
533
534 return (0);
535 }
536
537 int
538 softdep_check_suspend(struct mount *mp,
539 struct vnode *devvp,
540 int softdep_depcnt,
541 int softdep_accdepcnt,
542 int secondary_writes,
543 int secondary_accwrites)
544 {
545 struct bufobj *bo;
546 int error;
547
548 (void) softdep_depcnt;
549 (void) softdep_accdepcnt;
550
551 bo = &devvp->v_bufobj;
552 ASSERT_BO_WLOCKED(bo);
553
554 MNT_ILOCK(mp);
555 while (mp->mnt_secondary_writes != 0) {
556 BO_UNLOCK(bo);
557 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
558 (PUSER - 1) | PDROP, "secwr", 0);
559 BO_LOCK(bo);
560 MNT_ILOCK(mp);
561 }
562
563 /*
564 * Reasons for needing more work before suspend:
565 * - Dirty buffers on devvp.
566 * - Secondary writes occurred after start of vnode sync loop.
567 */
568 error = 0;
569 if (bo->bo_numoutput > 0 ||
570 bo->bo_dirty.bv_cnt > 0 ||
571 secondary_writes != 0 ||
572 mp->mnt_secondary_writes != 0 ||
573 secondary_accwrites != mp->mnt_secondary_accwrites)
574 error = EAGAIN;
575 BO_UNLOCK(bo);
576 return (error);
577 }
578
579 void
580 softdep_get_depcounts(struct mount *mp,
581 int *softdepactivep,
582 int *softdepactiveaccp)
583 {
584 (void) mp;
585 *softdepactivep = 0;
586 *softdepactiveaccp = 0;
587 }
588
589 void
590 softdep_buf_append(bp, wkhd)
591 struct buf *bp;
592 struct workhead *wkhd;
593 {
594
595 panic("softdep_buf_append called");
596 }
597
598 void
599 softdep_inode_append(ip, cred, wkhd)
600 struct inode *ip;
601 struct ucred *cred;
602 struct workhead *wkhd;
603 {
604
605 panic("softdep_inode_append called");
606 }
607
608 void
609 softdep_freework(wkhd)
610 struct workhead *wkhd;
611 {
612
613 panic("softdep_freework called");
614 }
615
616 #else
617
618 FEATURE(softupdates, "FFS soft-updates support");
619
620 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
621 "soft updates stats");
622 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
623 "total dependencies allocated");
624 static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
625 "high use dependencies allocated");
626 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
627 "current dependencies allocated");
628 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
629 "current dependencies written");
630
631 unsigned long dep_current[D_LAST + 1];
632 unsigned long dep_highuse[D_LAST + 1];
633 unsigned long dep_total[D_LAST + 1];
634 unsigned long dep_write[D_LAST + 1];
635
636 #define SOFTDEP_TYPE(type, str, long) \
637 static MALLOC_DEFINE(M_ ## type, #str, long); \
638 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \
639 &dep_total[D_ ## type], 0, ""); \
640 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \
641 &dep_current[D_ ## type], 0, ""); \
642 SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \
643 &dep_highuse[D_ ## type], 0, ""); \
644 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \
645 &dep_write[D_ ## type], 0, "");
646
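/*
 * For illustration: a single invocation below, such as
 * SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"), expands to
 * roughly:
 *
 *	static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
 *	SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
 *	    &dep_total[D_PAGEDEP], 0, "");
 *	... and likewise for the current, highuse and write counters ...
 *
 * i.e. one malloc(9) type plus four read-only counters exported as
 * debug.softdep.{total,current,highuse,write}.pagedep.
 */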
647 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
648 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
649 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
650 "Block or frag allocated from cyl group map");
651 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
652 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
653 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
654 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
655 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
656 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
657 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
658 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
659 SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
660 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
661 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
662 SOFTDEP_TYPE(FREEWORK, freework, "Free an inode block");
663 SOFTDEP_TYPE(FREEDEP, freedep, "Track a block free");
664 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
665 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
666 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
667 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
668 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
669 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
670 SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
671 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
672 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
673 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
674 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
675
676 static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
677
678 static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
679 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
680 static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
681
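/* malloc(9) flags for soft dependency allocations; M_WAITOK allocations may sleep but do not fail. */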
682 #define M_SOFTDEP_FLAGS (M_WAITOK)
683
684 /*
685 * translate from workitem type to memory type
686 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
687 */
688 static struct malloc_type *memtype[] = {
689 M_PAGEDEP,
690 M_INODEDEP,
691 M_BMSAFEMAP,
692 M_NEWBLK,
693 M_ALLOCDIRECT,
694 M_INDIRDEP,
695 M_ALLOCINDIR,
696 M_FREEFRAG,
697 M_FREEBLKS,
698 M_FREEFILE,
699 M_DIRADD,
700 M_MKDIR,
701 M_DIRREM,
702 M_NEWDIRBLK,
703 M_FREEWORK,
704 M_FREEDEP,
705 M_JADDREF,
706 M_JREMREF,
707 M_JMVREF,
708 M_JNEWBLK,
709 M_JFREEBLK,
710 M_JFREEFRAG,
711 M_JSEG,
712 M_JSEGDEP,
713 M_SBDEP,
714 M_JTRUNC,
715 M_JFSYNC,
716 M_SENTINEL
717 };
718
719 #define DtoM(type) (memtype[type])
720
721 /*
722 * Names of malloc types.
723 */
724 #define TYPENAME(type) \
725 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
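/*
 * Illustrative examples of the two lookups above: DtoM(D_PAGEDEP) yields
 * M_PAGEDEP, and TYPENAME(D_PAGEDEP) yields its short description,
 * "pagedep"; a type greater than D_LAST maps to "???".
 */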
726 /*
727 * End system adaptation definitions.
728 */
729
730 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
731 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
732
733 /*
734 * Internal function prototypes.
735 */
736 static void check_clear_deps(struct mount *);
737 static void softdep_error(char *, int);
738 static int softdep_process_worklist(struct mount *, int);
739 static int softdep_waitidle(struct mount *, int);
740 static void drain_output(struct vnode *);
741 static struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
742 static int check_inodedep_free(struct inodedep *);
743 static void clear_remove(struct mount *);
744 static void clear_inodedeps(struct mount *);
745 static void unlinked_inodedep(struct mount *, struct inodedep *);
746 static void clear_unlinked_inodedep(struct inodedep *);
747 static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
748 static int flush_pagedep_deps(struct vnode *, struct mount *,
749 struct diraddhd *);
750 static int free_pagedep(struct pagedep *);
751 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
752 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
753 static int flush_deplist(struct allocdirectlst *, int, int *);
754 static int sync_cgs(struct mount *, int);
755 static int handle_written_filepage(struct pagedep *, struct buf *, int);
756 static int handle_written_sbdep(struct sbdep *, struct buf *);
757 static void initiate_write_sbdep(struct sbdep *);
758 static void diradd_inode_written(struct diradd *, struct inodedep *);
759 static int handle_written_indirdep(struct indirdep *, struct buf *,
760 struct buf**, int);
761 static int handle_written_inodeblock(struct inodedep *, struct buf *, int);
762 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
763 uint8_t *);
764 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
765 static void handle_written_jaddref(struct jaddref *);
766 static void handle_written_jremref(struct jremref *);
767 static void handle_written_jseg(struct jseg *, struct buf *);
768 static void handle_written_jnewblk(struct jnewblk *);
769 static void handle_written_jblkdep(struct jblkdep *);
770 static void handle_written_jfreefrag(struct jfreefrag *);
771 static void complete_jseg(struct jseg *);
772 static void complete_jsegs(struct jseg *);
773 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
774 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
775 static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
776 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
777 static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
778 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
779 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
780 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
781 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
782 static inline void inoref_write(struct inoref *, struct jseg *,
783 struct jrefrec *);
784 static void handle_allocdirect_partdone(struct allocdirect *,
785 struct workhead *);
786 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
787 struct workhead *);
788 static void indirdep_complete(struct indirdep *);
789 static int indirblk_lookup(struct mount *, ufs2_daddr_t);
790 static void indirblk_insert(struct freework *);
791 static void indirblk_remove(struct freework *);
792 static void handle_allocindir_partdone(struct allocindir *);
793 static void initiate_write_filepage(struct pagedep *, struct buf *);
794 static void initiate_write_indirdep(struct indirdep*, struct buf *);
795 static void handle_written_mkdir(struct mkdir *, int);
796 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
797 uint8_t *);
798 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
799 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
800 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
801 static void handle_workitem_freefile(struct freefile *);
802 static int handle_workitem_remove(struct dirrem *, int);
803 static struct dirrem *newdirrem(struct buf *, struct inode *,
804 struct inode *, int, struct dirrem **);
805 static struct indirdep *indirdep_lookup(struct mount *, struct inode *,
806 struct buf *);
807 static void cancel_indirdep(struct indirdep *, struct buf *,
808 struct freeblks *);
809 static void free_indirdep(struct indirdep *);
810 static void free_diradd(struct diradd *, struct workhead *);
811 static void merge_diradd(struct inodedep *, struct diradd *);
812 static void complete_diradd(struct diradd *);
813 static struct diradd *diradd_lookup(struct pagedep *, int);
814 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
815 struct jremref *);
816 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
817 struct jremref *);
818 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
819 struct jremref *, struct jremref *);
820 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
821 struct jremref *);
822 static void cancel_allocindir(struct allocindir *, struct buf *bp,
823 struct freeblks *, int);
824 static int setup_trunc_indir(struct freeblks *, struct inode *,
825 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
826 static void complete_trunc_indir(struct freework *);
827 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
828 int);
829 static void complete_mkdir(struct mkdir *);
830 static void free_newdirblk(struct newdirblk *);
831 static void free_jremref(struct jremref *);
832 static void free_jaddref(struct jaddref *);
833 static void free_jsegdep(struct jsegdep *);
834 static void free_jsegs(struct jblocks *);
835 static void rele_jseg(struct jseg *);
836 static void free_jseg(struct jseg *, struct jblocks *);
837 static void free_jnewblk(struct jnewblk *);
838 static void free_jblkdep(struct jblkdep *);
839 static void free_jfreefrag(struct jfreefrag *);
840 static void free_freedep(struct freedep *);
841 static void journal_jremref(struct dirrem *, struct jremref *,
842 struct inodedep *);
843 static void cancel_jnewblk(struct jnewblk *, struct workhead *);
844 static int cancel_jaddref(struct jaddref *, struct inodedep *,
845 struct workhead *);
846 static void cancel_jfreefrag(struct jfreefrag *);
847 static inline void setup_freedirect(struct freeblks *, struct inode *,
848 int, int);
849 static inline void setup_freeext(struct freeblks *, struct inode *, int, int);
850 static inline void setup_freeindir(struct freeblks *, struct inode *, int,
851 ufs_lbn_t, int);
852 static inline struct freeblks *newfreeblks(struct mount *, struct inode *);
853 static void freeblks_free(struct ufsmount *, struct freeblks *, int);
854 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
855 static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
856 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
857 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
858 int, int);
859 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
860 static int cancel_pagedep(struct pagedep *, struct freeblks *, int);
861 static int deallocate_dependencies(struct buf *, struct freeblks *, int);
862 static void newblk_freefrag(struct newblk*);
863 static void free_newblk(struct newblk *);
864 static void cancel_allocdirect(struct allocdirectlst *,
865 struct allocdirect *, struct freeblks *);
866 static int check_inode_unwritten(struct inodedep *);
867 static int free_inodedep(struct inodedep *);
868 static void freework_freeblock(struct freework *);
869 static void freework_enqueue(struct freework *);
870 static int handle_workitem_freeblocks(struct freeblks *, int);
871 static int handle_complete_freeblocks(struct freeblks *, int);
872 static void handle_workitem_indirblk(struct freework *);
873 static void handle_written_freework(struct freework *);
874 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
875 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
876 struct workhead *);
877 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
878 struct inodedep *, struct allocindir *, ufs_lbn_t);
879 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
880 ufs2_daddr_t, ufs_lbn_t);
881 static void handle_workitem_freefrag(struct freefrag *);
882 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
883 ufs_lbn_t);
884 static void allocdirect_merge(struct allocdirectlst *,
885 struct allocdirect *, struct allocdirect *);
886 static struct freefrag *allocindir_merge(struct allocindir *,
887 struct allocindir *);
888 static int bmsafemap_find(struct bmsafemap_hashhead *, int,
889 struct bmsafemap **);
890 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
891 int cg, struct bmsafemap *);
892 static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
893 struct newblk **);
894 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
895 static int inodedep_find(struct inodedep_hashhead *, ino_t,
896 struct inodedep **);
897 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
898 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
899 int, struct pagedep **);
900 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
901 struct pagedep **);
902 static void pause_timer(void *);
903 static int request_cleanup(struct mount *, int);
904 static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
905 static void schedule_cleanup(struct mount *);
906 static void softdep_ast_cleanup_proc(struct thread *);
907 static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
908 static int process_worklist_item(struct mount *, int, int);
909 static void process_removes(struct vnode *);
910 static void process_truncates(struct vnode *);
911 static void jwork_move(struct workhead *, struct workhead *);
912 static void jwork_insert(struct workhead *, struct jsegdep *);
913 static void add_to_worklist(struct worklist *, int);
914 static void wake_worklist(struct worklist *);
915 static void wait_worklist(struct worklist *, char *);
916 static void remove_from_worklist(struct worklist *);
917 static void softdep_flush(void *);
918 static void softdep_flushjournal(struct mount *);
919 static int softdep_speedup(struct ufsmount *);
920 static void worklist_speedup(struct mount *);
921 static int journal_mount(struct mount *, struct fs *, struct ucred *);
922 static void journal_unmount(struct ufsmount *);
923 static int journal_space(struct ufsmount *, int);
924 static void journal_suspend(struct ufsmount *);
925 static int journal_unsuspend(struct ufsmount *ump);
926 static void softdep_prelink(struct vnode *, struct vnode *);
927 static void add_to_journal(struct worklist *);
928 static void remove_from_journal(struct worklist *);
929 static bool softdep_excess_items(struct ufsmount *, int);
930 static void softdep_process_journal(struct mount *, struct worklist *, int);
931 static struct jremref *newjremref(struct dirrem *, struct inode *,
932 struct inode *ip, off_t, nlink_t);
933 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
934 uint16_t);
935 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
936 uint16_t);
937 static inline struct jsegdep *inoref_jseg(struct inoref *);
938 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
939 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
940 ufs2_daddr_t, int);
941 static void adjust_newfreework(struct freeblks *, int);
942 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
943 static void move_newblock_dep(struct jaddref *, struct inodedep *);
944 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
945 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
946 ufs2_daddr_t, long, ufs_lbn_t);
947 static struct freework *newfreework(struct ufsmount *, struct freeblks *,
948 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
949 static int jwait(struct worklist *, int);
950 static struct inodedep *inodedep_lookup_ip(struct inode *);
951 static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
952 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
953 static void handle_jwork(struct workhead *);
954 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
955 struct mkdir **);
956 static struct jblocks *jblocks_create(void);
957 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
958 static void jblocks_free(struct jblocks *, struct mount *, int);
959 static void jblocks_destroy(struct jblocks *);
960 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
961
962 /*
963 * Exported softdep operations.
964 */
965 static void softdep_disk_io_initiation(struct buf *);
966 static void softdep_disk_write_complete(struct buf *);
967 static void softdep_deallocate_dependencies(struct buf *);
968 static int softdep_count_dependencies(struct buf *bp, int);
969
970 /*
971 * Global lock over all of soft updates.
972 */
973 static struct mtx lk;
974 MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
975
976 #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk)
977 #define FREE_GBLLOCK(lk) mtx_unlock(lk)
978 #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED)
979
980 /*
981 * Per-filesystem soft-updates locking.
982 */
983 #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock)
984 #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock)
985 #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock)
986 #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock)
987 #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \
988 RA_WLOCKED)
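/*
 * Typical locking pattern used throughout this file (illustrative sketch):
 *
 *	ump = VFSTOUFS(mp);
 *	ACQUIRE_LOCK(ump);
 *	... manipulate ump->softdep_* state ...
 *	FREE_LOCK(ump);
 *
 * The global mutex (lk) protects the softdepmounts list and the dep_total,
 * dep_highuse, dep_current and dep_write statistics, while the
 * per-filesystem rwlock protects that mount's soft dependency structures.
 */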
989
990 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock)
991 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock)
992
993 /*
994 * Worklist queue management.
995 * These routines require that the lock be held.
996 */
997 #ifndef /* NOT */ DEBUG
998 #define WORKLIST_INSERT(head, item) do { \
999 (item)->wk_state |= ONWORKLIST; \
1000 LIST_INSERT_HEAD(head, item, wk_list); \
1001 } while (0)
1002 #define WORKLIST_REMOVE(item) do { \
1003 (item)->wk_state &= ~ONWORKLIST; \
1004 LIST_REMOVE(item, wk_list); \
1005 } while (0)
1006 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
1007 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
1008
1009 #else /* DEBUG */
1010 static void worklist_insert(struct workhead *, struct worklist *, int);
1011 static void worklist_remove(struct worklist *, int);
1012
1013 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1014 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1015 #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1016 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1017
1018 static void
1019 worklist_insert(head, item, locked)
1020 struct workhead *head;
1021 struct worklist *item;
1022 int locked;
1023 {
1024
1025 if (locked)
1026 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1027 if (item->wk_state & ONWORKLIST)
1028 panic("worklist_insert: %p %s(0x%X) already on list",
1029 item, TYPENAME(item->wk_type), item->wk_state);
1030 item->wk_state |= ONWORKLIST;
1031 LIST_INSERT_HEAD(head, item, wk_list);
1032 }
1033
1034 static void
1035 worklist_remove(item, locked)
1036 struct worklist *item;
1037 int locked;
1038 {
1039
1040 if (locked)
1041 LOCK_OWNED(VFSTOUFS(item->wk_mp));
1042 if ((item->wk_state & ONWORKLIST) == 0)
1043 panic("worklist_remove: %p %s(0x%X) not on list",
1044 item, TYPENAME(item->wk_type), item->wk_state);
1045 item->wk_state &= ~ONWORKLIST;
1046 LIST_REMOVE(item, wk_list);
1047 }
1048 #endif /* DEBUG */
1049
1050 /*
1051 * Merge two jsegdeps, keeping only the oldest one, since newer references
1052 * cannot be discarded until after the older ones.
1053 */
1054 static inline struct jsegdep *
1055 jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1056 {
1057 struct jsegdep *swp;
1058
1059 if (two == NULL)
1060 return (one);
1061
1062 if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1063 swp = one;
1064 one = two;
1065 two = swp;
1066 }
1067 WORKLIST_REMOVE(&two->jd_list);
1068 free_jsegdep(two);
1069
1070 return (one);
1071 }
1072
1073 /*
1074 * If two freedeps are compatible, free one to reduce the list size.
1075 */
1076 static inline struct freedep *
1077 freedep_merge(struct freedep *one, struct freedep *two)
1078 {
1079 if (two == NULL)
1080 return (one);
1081
1082 if (one->fd_freework == two->fd_freework) {
1083 WORKLIST_REMOVE(&two->fd_list);
1084 free_freedep(two);
1085 }
1086 return (one);
1087 }
1088
1089 /*
1090 * Move journal work from one list to another. Duplicate freedeps and
1091 * jsegdeps are coalesced to keep the lists as small as possible.
1092 */
1093 static void
1094 jwork_move(dst, src)
1095 struct workhead *dst;
1096 struct workhead *src;
1097 {
1098 struct freedep *freedep;
1099 struct jsegdep *jsegdep;
1100 struct worklist *wkn;
1101 struct worklist *wk;
1102
1103 KASSERT(dst != src,
1104 ("jwork_move: dst == src"));
1105 freedep = NULL;
1106 jsegdep = NULL;
1107 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1108 if (wk->wk_type == D_JSEGDEP)
1109 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1110 else if (wk->wk_type == D_FREEDEP)
1111 freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1112 }
1113
1114 while ((wk = LIST_FIRST(src)) != NULL) {
1115 WORKLIST_REMOVE(wk);
1116 WORKLIST_INSERT(dst, wk);
1117 if (wk->wk_type == D_JSEGDEP) {
1118 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1119 continue;
1120 }
1121 if (wk->wk_type == D_FREEDEP)
1122 freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1123 }
1124 }
1125
1126 static void
1127 jwork_insert(dst, jsegdep)
1128 struct workhead *dst;
1129 struct jsegdep *jsegdep;
1130 {
1131 struct jsegdep *jsegdepn;
1132 struct worklist *wk;
1133
1134 LIST_FOREACH(wk, dst, wk_list)
1135 if (wk->wk_type == D_JSEGDEP)
1136 break;
1137 if (wk == NULL) {
1138 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1139 return;
1140 }
1141 jsegdepn = WK_JSEGDEP(wk);
1142 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1143 WORKLIST_REMOVE(wk);
1144 free_jsegdep(jsegdepn);
1145 WORKLIST_INSERT(dst, &jsegdep->jd_list);
1146 } else
1147 free_jsegdep(jsegdep);
1148 }
1149
1150 /*
1151 * Routines for tracking and managing workitems.
1152 */
1153 static void workitem_free(struct worklist *, int);
1154 static void workitem_alloc(struct worklist *, int, struct mount *);
1155 static void workitem_reassign(struct worklist *, int);
1156
1157 #define WORKITEM_FREE(item, type) \
1158 workitem_free((struct worklist *)(item), (type))
1159 #define WORKITEM_REASSIGN(item, type) \
1160 workitem_reassign((struct worklist *)(item), (type))
1161
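/*
 * Release a work item: wake any thread sleeping on it in wait_worklist()
 * (IOWAITING), drop the per-mount and global dependency counts, and return
 * the memory to the malloc type matching the item's type. The caller must
 * hold the per-filesystem lock.
 */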
1162 static void
1163 workitem_free(item, type)
1164 struct worklist *item;
1165 int type;
1166 {
1167 struct ufsmount *ump;
1168
1169 #ifdef DEBUG
1170 if (item->wk_state & ONWORKLIST)
1171 panic("workitem_free: %s(0x%X) still on list",
1172 TYPENAME(item->wk_type), item->wk_state);
1173 if (item->wk_type != type && type != D_NEWBLK)
1174 panic("workitem_free: type mismatch %s != %s",
1175 TYPENAME(item->wk_type), TYPENAME(type));
1176 #endif
1177 if (item->wk_state & IOWAITING)
1178 wakeup(item);
1179 ump = VFSTOUFS(item->wk_mp);
1180 LOCK_OWNED(ump);
1181 KASSERT(ump->softdep_deps > 0,
1182 ("workitem_free: %s: softdep_deps going negative",
1183 ump->um_fs->fs_fsmnt));
1184 if (--ump->softdep_deps == 0 && ump->softdep_req)
1185 wakeup(&ump->softdep_deps);
1186 KASSERT(dep_current[item->wk_type] > 0,
1187 ("workitem_free: %s: dep_current[%s] going negative",
1188 ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1189 KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1190 ("workitem_free: %s: softdep_curdeps[%s] going negative",
1191 ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1192 atomic_subtract_long(&dep_current[item->wk_type], 1);
1193 ump->softdep_curdeps[item->wk_type] -= 1;
1194 free(item, DtoM(type));
1195 }
1196
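/*
 * Initialize a work item of the given type for mount mp and charge it to
 * both the global statistics (dep_current, dep_highuse, dep_total) and the
 * per-filesystem counters.
 */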
1197 static void
1198 workitem_alloc(item, type, mp)
1199 struct worklist *item;
1200 int type;
1201 struct mount *mp;
1202 {
1203 struct ufsmount *ump;
1204
1205 item->wk_type = type;
1206 item->wk_mp = mp;
1207 item->wk_state = 0;
1208
1209 ump = VFSTOUFS(mp);
1210 ACQUIRE_GBLLOCK(&lk);
1211 dep_current[type]++;
1212 if (dep_current[type] > dep_highuse[type])
1213 dep_highuse[type] = dep_current[type];
1214 dep_total[type]++;
1215 FREE_GBLLOCK(&lk);
1216 ACQUIRE_LOCK(ump);
1217 ump->softdep_curdeps[type] += 1;
1218 ump->softdep_deps++;
1219 ump->softdep_accdeps++;
1220 FREE_LOCK(ump);
1221 }
1222
1223 static void
1224 workitem_reassign(item, newtype)
1225 struct worklist *item;
1226 int newtype;
1227 {
1228 struct ufsmount *ump;
1229
1230 ump = VFSTOUFS(item->wk_mp);
1231 LOCK_OWNED(ump);
1232 KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1233 ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1234 VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1235 ump->softdep_curdeps[item->wk_type] -= 1;
1236 ump->softdep_curdeps[newtype] += 1;
1237 KASSERT(dep_current[item->wk_type] > 0,
1238 ("workitem_reassign: %s: dep_current[%s] going negative",
1239 VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1240 ACQUIRE_GBLLOCK(&lk);
1241 dep_current[newtype]++;
1242 dep_current[item->wk_type]--;
1243 if (dep_current[newtype] > dep_highuse[newtype])
1244 dep_highuse[newtype] = dep_current[newtype];
1245 dep_total[newtype]++;
1246 FREE_GBLLOCK(&lk);
1247 item->wk_type = newtype;
1248 }
1249
1250 /*
1251 * Workitem queue management
1252 */
1253 static int max_softdeps; /* maximum number of structs before slowdown */
1254 static int tickdelay = 2; /* number of ticks to pause during slowdown */
1255 static int proc_waiting; /* tracks whether we have a timeout posted */
1256 static int *stat_countp; /* statistic to count in proc_waiting timeout */
1257 static struct callout softdep_callout;
1258 static int req_clear_inodedeps; /* syncer process should flush some inodedeps */
1259 static int req_clear_remove; /* syncer process should flush some freeblks */
1260 static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1261
1262 /*
1263 * runtime statistics
1264 */
1265 static int stat_flush_threads; /* number of softdep flushing threads */
1266 static int stat_worklist_push; /* number of worklist cleanups */
1267 static int stat_blk_limit_push; /* number of times block limit neared */
1268 static int stat_ino_limit_push; /* number of times inode limit neared */
1269 static int stat_blk_limit_hit; /* number of times block slowdown imposed */
1270 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
1271 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
1272 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
1273 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
1274 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1275 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
1276 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */
1277 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */
1278 static int stat_journal_min; /* Times hit journal min threshold */
1279 static int stat_journal_low; /* Times hit journal low threshold */
1280 static int stat_journal_wait; /* Times blocked in jwait(). */
1281 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
1282 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
1283 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
1284 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
1285 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1286 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1287 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1288 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1289 static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1290 static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1291
1292 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1293 &max_softdeps, 0, "");
1294 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1295 &tickdelay, 0, "");
1296 SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1297 &stat_flush_threads, 0, "");
1298 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1299 &stat_worklist_push, 0,"");
1300 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1301 &stat_blk_limit_push, 0,"");
1302 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1303 &stat_ino_limit_push, 0,"");
1304 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1305 &stat_blk_limit_hit, 0, "");
1306 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1307 &stat_ino_limit_hit, 0, "");
1308 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1309 &stat_sync_limit_hit, 0, "");
1310 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1311 &stat_indir_blk_ptrs, 0, "");
1312 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1313 &stat_inode_bitmap, 0, "");
1314 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1315 &stat_direct_blk_ptrs, 0, "");
1316 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1317 &stat_dir_entry, 0, "");
1318 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1319 &stat_jaddref, 0, "");
1320 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1321 &stat_jnewblk, 0, "");
1322 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1323 &stat_journal_low, 0, "");
1324 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1325 &stat_journal_min, 0, "");
1326 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1327 &stat_journal_wait, 0, "");
1328 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1329 &stat_jwait_filepage, 0, "");
1330 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1331 &stat_jwait_freeblks, 0, "");
1332 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1333 &stat_jwait_inode, 0, "");
1334 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1335 &stat_jwait_newblk, 0, "");
1336 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1337 &stat_cleanup_blkrequests, 0, "");
1338 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1339 &stat_cleanup_inorequests, 0, "");
1340 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1341 &stat_cleanup_high_delay, 0, "");
1342 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1343 &stat_cleanup_retries, 0, "");
1344 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1345 &stat_cleanup_failures, 0, "");
1346 SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1347 &softdep_flushcache, 0, "");
1348 SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1349 &stat_emptyjblocks, 0, "");
1350
1351 SYSCTL_DECL(_vfs_ffs);
1352
1353 /* Whether to recompute the summary at mount time */
1354 static int compute_summary_at_mount = 0;
1355 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1356 &compute_summary_at_mount, 0, "Recompute summary at mount");
1357 static int print_threads = 0;
1358 SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1359 &print_threads, 0, "Notify flusher thread start/stop");
1360
1361 /* List of all filesystems mounted with soft updates */
1362 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1363
1364 /*
1365 * This function cleans the worklist for a filesystem.
1366 * Each filesystem running with soft dependencies gets its own
1367 * thread to run in this function. The thread is started up in
1368 * softdep_mount and shut down in softdep_unmount. These threads show up
1369 * as part of the kernel "bufdaemon" process whose process
1370 * entry is available in bufdaemonproc.
1371 */
1372 static int searchfailed;
1373 extern struct proc *bufdaemonproc;
1374 static void
1375 softdep_flush(addr)
1376 void *addr;
1377 {
1378 struct mount *mp;
1379 struct thread *td;
1380 struct ufsmount *ump;
1381
1382 td = curthread;
1383 td->td_pflags |= TDP_NORUNNINGBUF;
1384 mp = (struct mount *)addr;
1385 ump = VFSTOUFS(mp);
1386 atomic_add_int(&stat_flush_threads, 1);
1387 ACQUIRE_LOCK(ump);
1388 ump->softdep_flags &= ~FLUSH_STARTING;
1389 wakeup(&ump->softdep_flushtd);
1390 FREE_LOCK(ump);
1391 if (print_threads) {
1392 if (stat_flush_threads == 1)
1393 printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1394 bufdaemonproc->p_pid);
1395 printf("Start thread %s\n", td->td_name);
1396 }
1397 for (;;) {
1398 while (softdep_process_worklist(mp, 0) > 0 ||
1399 (MOUNTEDSUJ(mp) &&
1400 VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1401 kthread_suspend_check();
1402 ACQUIRE_LOCK(ump);
1403 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1404 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1405 "sdflush", hz / 2);
1406 ump->softdep_flags &= ~FLUSH_CLEANUP;
1407 /*
1408 * Check to see if we are done and need to exit.
1409 */
1410 if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1411 FREE_LOCK(ump);
1412 continue;
1413 }
1414 ump->softdep_flags &= ~FLUSH_EXIT;
1415 FREE_LOCK(ump);
1416 wakeup(&ump->softdep_flags);
1417 if (print_threads)
1418 printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1419 atomic_subtract_int(&stat_flush_threads, 1);
1420 kthread_exit();
1421 panic("kthread_exit failed\n");
1422 }
1423 }
1424
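/*
 * Ask the per-filesystem flusher thread to run a cleanup pass by setting
 * FLUSH_CLEANUP and waking it. The caller must hold the per-filesystem lock.
 */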
1425 static void
1426 worklist_speedup(mp)
1427 struct mount *mp;
1428 {
1429 struct ufsmount *ump;
1430
1431 ump = VFSTOUFS(mp);
1432 LOCK_OWNED(ump);
1433 if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1434 ump->softdep_flags |= FLUSH_CLEANUP;
1435 wakeup(&ump->softdep_flushtd);
1436 }
1437
1438 static int
1439 softdep_speedup(ump)
1440 struct ufsmount *ump;
1441 {
1442 struct ufsmount *altump;
1443 struct mount_softdeps *sdp;
1444
1445 LOCK_OWNED(ump);
1446 worklist_speedup(ump->um_mountp);
1447 bd_speedup();
1448 /*
1449 * If we have global shortages, then we need other
1450 * filesystems to help with the cleanup. Here we wake up a
1451 * flusher thread for a filesystem that is over its fair
1452 * share of resources.
1453 */
1454 if (req_clear_inodedeps || req_clear_remove) {
1455 ACQUIRE_GBLLOCK(&lk);
1456 TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1457 if ((altump = sdp->sd_ump) == ump)
1458 continue;
1459 if (((req_clear_inodedeps &&
1460 altump->softdep_curdeps[D_INODEDEP] >
1461 max_softdeps / stat_flush_threads) ||
1462 (req_clear_remove &&
1463 altump->softdep_curdeps[D_DIRREM] >
1464 (max_softdeps / 2) / stat_flush_threads)) &&
1465 TRY_ACQUIRE_LOCK(altump))
1466 break;
1467 }
1468 if (sdp == NULL) {
1469 searchfailed++;
1470 FREE_GBLLOCK(&lk);
1471 } else {
1472 /*
1473 * Move to the end of the list so we pick a
1474 * different one on our next try.
1475 */
1476 TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1477 TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1478 FREE_GBLLOCK(&lk);
1479 if ((altump->softdep_flags &
1480 (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1481 altump->softdep_flags |= FLUSH_CLEANUP;
1482 altump->um_softdep->sd_cleanups++;
1483 wakeup(&altump->softdep_flushtd);
1484 FREE_LOCK(altump);
1485 }
1486 }
1487 return (speedup_syncer());
1488 }
1489
1490 /*
1491 * Add an item to the end of the work queue.
1492 * This routine requires that the lock be held.
1493 * This is the only routine that adds items to the list.
1494 * The following routine is the only one that removes items
1495 * and does so in order from first to last.
1496 */
1497
1498 #define WK_HEAD 0x0001 /* Add to HEAD. */
1499 #define WK_NODELAY 0x0002 /* Process immediately. */
1500
1501 static void
1502 add_to_worklist(wk, flags)
1503 struct worklist *wk;
1504 int flags;
1505 {
1506 struct ufsmount *ump;
1507
1508 ump = VFSTOUFS(wk->wk_mp);
1509 LOCK_OWNED(ump);
1510 if (wk->wk_state & ONWORKLIST)
1511 panic("add_to_worklist: %s(0x%X) already on list",
1512 TYPENAME(wk->wk_type), wk->wk_state);
1513 wk->wk_state |= ONWORKLIST;
1514 if (ump->softdep_on_worklist == 0) {
1515 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1516 ump->softdep_worklist_tail = wk;
1517 } else if (flags & WK_HEAD) {
1518 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1519 } else {
1520 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1521 ump->softdep_worklist_tail = wk;
1522 }
1523 ump->softdep_on_worklist += 1;
1524 if (flags & WK_NODELAY)
1525 worklist_speedup(wk->wk_mp);
1526 }
1527
1528 /*
1529 * Remove the item to be processed. If we are removing the last
1530 * item on the list, we need to recalculate the tail pointer.
1531 */
1532 static void
1533 remove_from_worklist(wk)
1534 struct worklist *wk;
1535 {
1536 struct ufsmount *ump;
1537
1538 ump = VFSTOUFS(wk->wk_mp);
1539 if (ump->softdep_worklist_tail == wk)
1540 ump->softdep_worklist_tail =
1541 (struct worklist *)wk->wk_list.le_prev;
1542 WORKLIST_REMOVE(wk);
1543 ump->softdep_on_worklist -= 1;
1544 }
1545
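/*
 * wait_worklist() marks an item IOWAITING and sleeps on it; wake_worklist()
 * (and workitem_free()) issue the matching wakeup once the item is no
 * longer being processed.
 */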
1546 static void
1547 wake_worklist(wk)
1548 struct worklist *wk;
1549 {
1550 if (wk->wk_state & IOWAITING) {
1551 wk->wk_state &= ~IOWAITING;
1552 wakeup(wk);
1553 }
1554 }
1555
1556 static void
1557 wait_worklist(wk, wmesg)
1558 struct worklist *wk;
1559 char *wmesg;
1560 {
1561 struct ufsmount *ump;
1562
1563 ump = VFSTOUFS(wk->wk_mp);
1564 wk->wk_state |= IOWAITING;
1565 msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1566 }
1567
1568 /*
1569 * Process that runs once per second to handle items in the background queue.
1570 *
1571 * Note that we ensure that items are processed in the order in which they
1572 * appear in the queue. The code below depends on this property to ensure
1573 * that blocks of a file are freed before the inode itself is freed. This
1574 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1575 * until all the old ones have been purged from the dependency lists.
1576 */
1577 static int
1578 softdep_process_worklist(mp, full)
1579 struct mount *mp;
1580 int full;
1581 {
1582 int cnt, matchcnt;
1583 struct ufsmount *ump;
1584 long starttime;
1585
1586 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1587 if (MOUNTEDSOFTDEP(mp) == 0)
1588 return (0);
1589 matchcnt = 0;
1590 ump = VFSTOUFS(mp);
1591 ACQUIRE_LOCK(ump);
1592 starttime = time_second;
1593 softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1594 check_clear_deps(mp);
1595 while (ump->softdep_on_worklist > 0) {
1596 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1597 break;
1598 else
1599 matchcnt += cnt;
1600 check_clear_deps(mp);
1601 /*
1602 * We do not generally want to stop for buffer space, but if
1603 * we are really being a buffer hog, we will stop and wait.
1604 */
1605 if (should_yield()) {
1606 FREE_LOCK(ump);
1607 kern_yield(PRI_USER);
1608 bwillwrite();
1609 ACQUIRE_LOCK(ump);
1610 }
1611 /*
1612 * Never allow processing to run for more than one
1613 * second. This gives the syncer thread the opportunity
1614 * to pause if appropriate.
1615 */
1616 if (!full && starttime != time_second)
1617 break;
1618 }
1619 if (full == 0)
1620 journal_unsuspend(ump);
1621 FREE_LOCK(ump);
1622 return (matchcnt);
1623 }
1624
1625 /*
1626 * Process all removes associated with a vnode if we are running out of
1627 * journal space. Any other process that attempts to flush these will
1628 * be unable to do so, as we hold the vnodes locked.
1629 */
1630 static void
1631 process_removes(vp)
1632 struct vnode *vp;
1633 {
1634 struct inodedep *inodedep;
1635 struct dirrem *dirrem;
1636 struct ufsmount *ump;
1637 struct mount *mp;
1638 ino_t inum;
1639
1640 mp = vp->v_mount;
1641 ump = VFSTOUFS(mp);
1642 LOCK_OWNED(ump);
1643 inum = VTOI(vp)->i_number;
1644 for (;;) {
1645 top:
1646 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1647 return;
1648 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1649 /*
1650 * If another thread is trying to lock this vnode,
1651 * it will fail, but we must wait for it to do so
1652 * before we can proceed.
1653 */
1654 if (dirrem->dm_state & INPROGRESS) {
1655 wait_worklist(&dirrem->dm_list, "pwrwait");
1656 goto top;
1657 }
1658 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1659 (COMPLETE | ONWORKLIST))
1660 break;
1661 }
1662 if (dirrem == NULL)
1663 return;
1664 remove_from_worklist(&dirrem->dm_list);
1665 FREE_LOCK(ump);
1666 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1667 panic("process_removes: suspended filesystem");
1668 handle_workitem_remove(dirrem, 0);
1669 vn_finished_secondary_write(mp);
1670 ACQUIRE_LOCK(ump);
1671 }
1672 }
1673
1674 /*
1675 * Process all truncations associated with a vnode if we are running out
1676 * of journal space. This is called when the vnode lock is already held
1677 * and no other process can clear the truncation.
1679 */
1680 static void
1681 process_truncates(vp)
1682 struct vnode *vp;
1683 {
1684 struct inodedep *inodedep;
1685 struct freeblks *freeblks;
1686 struct ufsmount *ump;
1687 struct mount *mp;
1688 ino_t inum;
1689 int cgwait;
1690
1691 mp = vp->v_mount;
1692 ump = VFSTOUFS(mp);
1693 LOCK_OWNED(ump);
1694 inum = VTOI(vp)->i_number;
1695 for (;;) {
1696 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1697 return;
1698 cgwait = 0;
1699 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1700 /* Journal entries not yet written. */
1701 if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1702 jwait(&LIST_FIRST(
1703 &freeblks->fb_jblkdephd)->jb_list,
1704 MNT_WAIT);
1705 break;
1706 }
1707 /* Another thread is executing this item. */
1708 if (freeblks->fb_state & INPROGRESS) {
1709 wait_worklist(&freeblks->fb_list, "ptrwait");
1710 break;
1711 }
1712 /* Freeblks is waiting on an inode write. */
1713 if ((freeblks->fb_state & COMPLETE) == 0) {
1714 FREE_LOCK(ump);
1715 ffs_update(vp, 1);
1716 ACQUIRE_LOCK(ump);
1717 break;
1718 }
1719 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1720 (ALLCOMPLETE | ONWORKLIST)) {
1721 remove_from_worklist(&freeblks->fb_list);
1722 freeblks->fb_state |= INPROGRESS;
1723 FREE_LOCK(ump);
1724 if (vn_start_secondary_write(NULL, &mp,
1725 V_NOWAIT))
1726 panic("process_truncates: "
1727 "suspended filesystem");
1728 handle_workitem_freeblocks(freeblks, 0);
1729 vn_finished_secondary_write(mp);
1730 ACQUIRE_LOCK(ump);
1731 break;
1732 }
1733 if (freeblks->fb_cgwait)
1734 cgwait++;
1735 }
1736 if (cgwait) {
1737 FREE_LOCK(ump);
1738 sync_cgs(mp, MNT_WAIT);
1739 ffs_sync_snap(mp, MNT_WAIT);
1740 ACQUIRE_LOCK(ump);
1741 continue;
1742 }
1743 if (freeblks == NULL)
1744 break;
1745 }
1746 return;
1747 }
1748
1749 /*
1750 * Process one item on the worklist.
1751 */
1752 static int
1753 process_worklist_item(mp, target, flags)
1754 struct mount *mp;
1755 int target;
1756 int flags;
1757 {
1758 struct worklist sentinel;
1759 struct worklist *wk;
1760 struct ufsmount *ump;
1761 int matchcnt;
1762 int error;
1763
1764 KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1765 /*
1766 * If we are being called because of a process doing a
1767 * copy-on-write, then it is not safe to write as we may
1768 * recurse into the copy-on-write routine.
1769 */
1770 if (curthread->td_pflags & TDP_COWINPROGRESS)
1771 return (-1);
1772 PHOLD(curproc); /* Don't let the stack go away. */
1773 ump = VFSTOUFS(mp);
1774 LOCK_OWNED(ump);
1775 matchcnt = 0;
1776 sentinel.wk_mp = NULL;
1777 sentinel.wk_type = D_SENTINEL;
1778 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1779 for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1780 wk = LIST_NEXT(&sentinel, wk_list)) {
1781 if (wk->wk_type == D_SENTINEL) {
1782 LIST_REMOVE(&sentinel, wk_list);
1783 LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1784 continue;
1785 }
1786 if (wk->wk_state & INPROGRESS)
1787 panic("process_worklist_item: %p already in progress.",
1788 wk);
1789 wk->wk_state |= INPROGRESS;
1790 remove_from_worklist(wk);
1791 FREE_LOCK(ump);
1792 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1793 panic("process_worklist_item: suspended filesystem");
1794 switch (wk->wk_type) {
1795 case D_DIRREM:
1796 /* removal of a directory entry */
1797 error = handle_workitem_remove(WK_DIRREM(wk), flags);
1798 break;
1799
1800 case D_FREEBLKS:
1801 /* releasing blocks and/or fragments from a file */
1802 error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1803 flags);
1804 break;
1805
1806 case D_FREEFRAG:
1807 /* releasing a fragment when replaced as a file grows */
1808 handle_workitem_freefrag(WK_FREEFRAG(wk));
1809 error = 0;
1810 break;
1811
1812 case D_FREEFILE:
1813 /* releasing an inode when its link count drops to 0 */
1814 handle_workitem_freefile(WK_FREEFILE(wk));
1815 error = 0;
1816 break;
1817
1818 default:
1819 panic("%s_process_worklist: Unknown type %s",
1820 "softdep", TYPENAME(wk->wk_type));
1821 /* NOTREACHED */
1822 }
1823 vn_finished_secondary_write(mp);
1824 ACQUIRE_LOCK(ump);
1825 if (error == 0) {
1826 if (++matchcnt == target)
1827 break;
1828 continue;
1829 }
1830 /*
1831 * We have to retry the worklist item later. Wake up any
1832 * waiters who may be able to complete it immediately and
1833 * add the item back to the head so we don't try to execute
1834 * it again.
1835 */
1836 wk->wk_state &= ~INPROGRESS;
1837 wake_worklist(wk);
1838 add_to_worklist(wk, WK_HEAD);
1839 }
1840 /* The sentinel could've become the tail from remove_from_worklist. */
1841 if (ump->softdep_worklist_tail == &sentinel)
1842 ump->softdep_worklist_tail =
1843 (struct worklist *)sentinel.wk_list.le_prev;
1844 LIST_REMOVE(&sentinel, wk_list);
1845 PRELE(curproc);
1846 return (matchcnt);
1847 }
1848
1849 /*
1850 * Move dependencies from one buffer to another.
1851 */
1852 int
1853 softdep_move_dependencies(oldbp, newbp)
1854 struct buf *oldbp;
1855 struct buf *newbp;
1856 {
1857 struct worklist *wk, *wktail;
1858 struct ufsmount *ump;
1859 int dirty;
1860
1861 if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1862 return (0);
1863 KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1864 ("softdep_move_dependencies called on non-softdep filesystem"));
1865 dirty = 0;
1866 wktail = NULL;
1867 ump = VFSTOUFS(wk->wk_mp);
1868 ACQUIRE_LOCK(ump);
1869 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1870 LIST_REMOVE(wk, wk_list);
1871 if (wk->wk_type == D_BMSAFEMAP &&
1872 bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1873 dirty = 1;
1874 if (wktail == NULL)
1875 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1876 else
1877 LIST_INSERT_AFTER(wktail, wk, wk_list);
1878 wktail = wk;
1879 }
1880 FREE_LOCK(ump);
1881
1882 return (dirty);
1883 }
1884
1885 /*
1886 * Purge the work list of all items associated with a particular mount point.
1887 */
1888 int
1889 softdep_flushworklist(oldmnt, countp, td)
1890 struct mount *oldmnt;
1891 int *countp;
1892 struct thread *td;
1893 {
1894 struct vnode *devvp;
1895 struct ufsmount *ump;
1896 int count, error;
1897
1898 /*
1899 * Alternately flush the block device associated with the mount
1900 * point and process any dependencies that the flushing
1901 * creates. We continue until no more worklist dependencies
1902 * are found.
1903 */
1904 *countp = 0;
1905 error = 0;
1906 ump = VFSTOUFS(oldmnt);
1907 devvp = ump->um_devvp;
1908 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1909 *countp += count;
1910 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1911 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1912 VOP_UNLOCK(devvp, 0);
1913 if (error != 0)
1914 break;
1915 }
1916 return (error);
1917 }
1918
1919 #define SU_WAITIDLE_RETRIES 20
1920 static int
1921 softdep_waitidle(struct mount *mp, int flags __unused)
1922 {
1923 struct ufsmount *ump;
1924 struct vnode *devvp;
1925 struct thread *td;
1926 int error, i;
1927
1928 ump = VFSTOUFS(mp);
1929 devvp = ump->um_devvp;
1930 td = curthread;
1931 error = 0;
1932 ACQUIRE_LOCK(ump);
1933 for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1934 ump->softdep_req = 1;
1935 KASSERT((flags & FORCECLOSE) == 0 ||
1936 ump->softdep_on_worklist == 0,
1937 ("softdep_waitidle: work added after flush"));
1938 msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1939 "softdeps", 10 * hz);
1940 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1941 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1942 VOP_UNLOCK(devvp, 0);
1943 ACQUIRE_LOCK(ump);
1944 if (error != 0)
1945 break;
1946 }
1947 ump->softdep_req = 0;
1948 if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1949 error = EBUSY;
1950 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1951 mp);
1952 }
1953 FREE_LOCK(ump);
1954 return (error);
1955 }
1956
1957 /*
1958 * Flush all vnodes and worklist items associated with a specified mount point.
1959 */
1960 int
1961 softdep_flushfiles(oldmnt, flags, td)
1962 struct mount *oldmnt;
1963 int flags;
1964 struct thread *td;
1965 {
1966 #ifdef QUOTA
1967 struct ufsmount *ump;
1968 int i;
1969 #endif
1970 int error, early, depcount, loopcnt, retry_flush_count, retry;
1971 int morework;
1972
1973 KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1974 ("softdep_flushfiles called on non-softdep filesystem"));
1975 loopcnt = 10;
1976 retry_flush_count = 3;
1977 retry_flush:
1978 error = 0;
1979
1980 /*
1981 * Alternately flush the vnodes associated with the mount
1982 * point and process any dependencies that the flushing
1983 * creates. In theory, this loop can happen at most twice,
1984 * but we give it a few extra just to be sure.
1985 */
1986 for (; loopcnt > 0; loopcnt--) {
1987 /*
1988 * Do another flush in case any vnodes were brought in
1989 * as part of the cleanup operations.
1990 */
1991 early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1992 MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1993 if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1994 break;
1995 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1996 depcount == 0)
1997 break;
1998 }
1999 /*
2000 * If we are unmounting then it is an error to fail. If we
2001 * are simply trying to downgrade to read-only, then filesystem
2002 * activity can keep us busy forever, so we just fail with EBUSY.
2003 */
2004 if (loopcnt == 0) {
2005 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2006 panic("softdep_flushfiles: looping");
2007 error = EBUSY;
2008 }
2009 if (!error)
2010 error = softdep_waitidle(oldmnt, flags);
2011 if (!error) {
2012 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2013 retry = 0;
2014 MNT_ILOCK(oldmnt);
2015 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2016 ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2017 morework = oldmnt->mnt_nvnodelistsize > 0;
2018 #ifdef QUOTA
2019 ump = VFSTOUFS(oldmnt);
2020 UFS_LOCK(ump);
2021 for (i = 0; i < MAXQUOTAS; i++) {
2022 if (ump->um_quotas[i] != NULLVP)
2023 morework = 1;
2024 }
2025 UFS_UNLOCK(ump);
2026 #endif
2027 if (morework) {
2028 if (--retry_flush_count > 0) {
2029 retry = 1;
2030 loopcnt = 3;
2031 } else
2032 error = EBUSY;
2033 }
2034 MNT_IUNLOCK(oldmnt);
2035 if (retry)
2036 goto retry_flush;
2037 }
2038 }
2039 return (error);
2040 }
2041
2042 /*
2043 * Structure hashing.
2044 *
2045 * There are four types of structures that can be looked up:
2046 * 1) pagedep structures identified by mount point, inode number,
2047 * and logical block.
2048 * 2) inodedep structures identified by mount point and inode number.
2049 * 3) newblk structures identified by mount point and
2050 * physical block number.
2051 * 4) bmsafemap structures identified by mount point and
2052 * cylinder group number.
2053 *
2054 * The "pagedep" and "inodedep" dependency structures are hashed
2055 * separately from the file blocks and inodes to which they correspond.
2056 * This separation helps when the in-memory copy of an inode or
2057 * file block must be replaced. It also obviates the need to access
2058 * an inode or file page when simply updating (or de-allocating)
2059 * dependency structures. Lookup of newblk structures is needed to
2060 * find newly allocated blocks when trying to associate them with
2061 * their allocdirect or allocindir structure.
2062 *
2063 * The lookup routines optionally create and hash a new instance when
2064 * an existing entry is not found. The bmsafemap lookup routine always
2065 * allocates a new structure if an existing one is not found.
2066 */
2067 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
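/*
 * Illustrative sketch (not a routine in this file): the lookup functions
 * below share one allocation pattern.  With the per-filesystem softdep
 * lock held, the hash chain is searched; if the entry is missing and
 * DEPALLOC was passed, the lock is dropped around the malloc() (some of
 * the routines instead call schedule_cleanup() when softdep_excess_items()
 * reports we are over our share), the lock is re-taken, and the search is
 * repeated in case another thread inserted the entry in the meantime:
 *
 *	if (xxx_find(hd, key, itempp))
 *		return (1);
 *	if ((flags & DEPALLOC) == 0)
 *		return (0);
 *	FREE_LOCK(ump);
 *	item = malloc(sizeof(*item), M_XXX, M_SOFTDEP_FLAGS);
 *	workitem_alloc(&item->x_list, D_XXX, mp);
 *	ACQUIRE_LOCK(ump);
 *	if (xxx_find(hd, key, itempp)) {
 *		WORKITEM_FREE(item, D_XXX);	/- lost the race -/
 *		return (1);
 *	}
 *	... initialize and hash the new item, return (0) ...
 *
 * The names xxx_find, M_XXX and D_XXX above are placeholders only.
 */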
2068
2069 /*
2070 * Structures and routines associated with pagedep caching.
2071 */
2072 #define PAGEDEP_HASH(ump, inum, lbn) \
2073 (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
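/*
 * Note: pagedep_hash_size, like the other *_hash_size fields set up in
 * softdep_mount(), is a mask (a power of two minus one, as returned by
 * hashinit()), so the AND above selects a bucket directly.  For example,
 * with a mask of 0xff, inum 100 and lbn 5 hash to bucket (100 + 5) & 0xff,
 * i.e. bucket 105.
 */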
2074
2075 static int
2076 pagedep_find(pagedephd, ino, lbn, pagedeppp)
2077 struct pagedep_hashhead *pagedephd;
2078 ino_t ino;
2079 ufs_lbn_t lbn;
2080 struct pagedep **pagedeppp;
2081 {
2082 struct pagedep *pagedep;
2083
2084 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2085 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2086 *pagedeppp = pagedep;
2087 return (1);
2088 }
2089 }
2090 *pagedeppp = NULL;
2091 return (0);
2092 }
2093 /*
2094 * Look up a pagedep. Return 1 if found, 0 otherwise.
2095 * If not found, allocate if DEPALLOC flag is passed.
2096 * Found or allocated entry is returned in pagedeppp.
2097 * This routine must be called with the per-filesystem softdep lock held.
2098 */
2099 static int
2100 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2101 struct mount *mp;
2102 struct buf *bp;
2103 ino_t ino;
2104 ufs_lbn_t lbn;
2105 int flags;
2106 struct pagedep **pagedeppp;
2107 {
2108 struct pagedep *pagedep;
2109 struct pagedep_hashhead *pagedephd;
2110 struct worklist *wk;
2111 struct ufsmount *ump;
2112 int ret;
2113 int i;
2114
2115 ump = VFSTOUFS(mp);
2116 LOCK_OWNED(ump);
2117 if (bp) {
2118 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2119 if (wk->wk_type == D_PAGEDEP) {
2120 *pagedeppp = WK_PAGEDEP(wk);
2121 return (1);
2122 }
2123 }
2124 }
2125 pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2126 ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2127 if (ret) {
2128 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2129 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2130 return (1);
2131 }
2132 if ((flags & DEPALLOC) == 0)
2133 return (0);
2134 FREE_LOCK(ump);
2135 pagedep = malloc(sizeof(struct pagedep),
2136 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2137 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2138 ACQUIRE_LOCK(ump);
2139 ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2140 if (*pagedeppp) {
2141 /*
2142 * This should never happen since we only create pagedeps
2143 * with the vnode lock held. Could be an assert.
2144 */
2145 WORKITEM_FREE(pagedep, D_PAGEDEP);
2146 return (ret);
2147 }
2148 pagedep->pd_ino = ino;
2149 pagedep->pd_lbn = lbn;
2150 LIST_INIT(&pagedep->pd_dirremhd);
2151 LIST_INIT(&pagedep->pd_pendinghd);
2152 for (i = 0; i < DAHASHSZ; i++)
2153 LIST_INIT(&pagedep->pd_diraddhd[i]);
2154 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2155 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2156 *pagedeppp = pagedep;
2157 return (0);
2158 }
2159
2160 /*
2161 * Structures and routines associated with inodedep caching.
2162 */
2163 #define INODEDEP_HASH(ump, inum) \
2164 (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2165
2166 static int
2167 inodedep_find(inodedephd, inum, inodedeppp)
2168 struct inodedep_hashhead *inodedephd;
2169 ino_t inum;
2170 struct inodedep **inodedeppp;
2171 {
2172 struct inodedep *inodedep;
2173
2174 LIST_FOREACH(inodedep, inodedephd, id_hash)
2175 if (inum == inodedep->id_ino)
2176 break;
2177 if (inodedep) {
2178 *inodedeppp = inodedep;
2179 return (1);
2180 }
2181 *inodedeppp = NULL;
2182
2183 return (0);
2184 }
2185 /*
2186 * Look up an inodedep. Return 1 if found, 0 if not found.
2187 * If not found, allocate if DEPALLOC flag is passed.
2188 * Found or allocated entry is returned in inodedeppp.
2189 * This routine must be called with the per-filesystem softdep lock held.
2190 */
2191 static int
2192 inodedep_lookup(mp, inum, flags, inodedeppp)
2193 struct mount *mp;
2194 ino_t inum;
2195 int flags;
2196 struct inodedep **inodedeppp;
2197 {
2198 struct inodedep *inodedep;
2199 struct inodedep_hashhead *inodedephd;
2200 struct ufsmount *ump;
2201 struct fs *fs;
2202
2203 ump = VFSTOUFS(mp);
2204 LOCK_OWNED(ump);
2205 fs = ump->um_fs;
2206 inodedephd = INODEDEP_HASH(ump, inum);
2207
2208 if (inodedep_find(inodedephd, inum, inodedeppp))
2209 return (1);
2210 if ((flags & DEPALLOC) == 0)
2211 return (0);
2212 /*
2213 * If the system is over its limit and our filesystem is
2214 * responsible for more than our share of that usage and
2215 * we are not in a rush, request some inodedep cleanup.
2216 */
2217 if (softdep_excess_items(ump, D_INODEDEP))
2218 schedule_cleanup(mp);
2219 else
2220 FREE_LOCK(ump);
2221 inodedep = malloc(sizeof(struct inodedep),
2222 M_INODEDEP, M_SOFTDEP_FLAGS);
2223 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2224 ACQUIRE_LOCK(ump);
2225 if (inodedep_find(inodedephd, inum, inodedeppp)) {
2226 WORKITEM_FREE(inodedep, D_INODEDEP);
2227 return (1);
2228 }
2229 inodedep->id_fs = fs;
2230 inodedep->id_ino = inum;
2231 inodedep->id_state = ALLCOMPLETE;
2232 inodedep->id_nlinkdelta = 0;
2233 inodedep->id_savedino1 = NULL;
2234 inodedep->id_savedsize = -1;
2235 inodedep->id_savedextsize = -1;
2236 inodedep->id_savednlink = -1;
2237 inodedep->id_bmsafemap = NULL;
2238 inodedep->id_mkdiradd = NULL;
2239 LIST_INIT(&inodedep->id_dirremhd);
2240 LIST_INIT(&inodedep->id_pendinghd);
2241 LIST_INIT(&inodedep->id_inowait);
2242 LIST_INIT(&inodedep->id_bufwait);
2243 TAILQ_INIT(&inodedep->id_inoreflst);
2244 TAILQ_INIT(&inodedep->id_inoupdt);
2245 TAILQ_INIT(&inodedep->id_newinoupdt);
2246 TAILQ_INIT(&inodedep->id_extupdt);
2247 TAILQ_INIT(&inodedep->id_newextupdt);
2248 TAILQ_INIT(&inodedep->id_freeblklst);
2249 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2250 *inodedeppp = inodedep;
2251 return (0);
2252 }
2253
2254 /*
2255 * Structures and routines associated with newblk caching.
2256 */
2257 #define NEWBLK_HASH(ump, inum) \
2258 (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2259
2260 static int
2261 newblk_find(newblkhd, newblkno, flags, newblkpp)
2262 struct newblk_hashhead *newblkhd;
2263 ufs2_daddr_t newblkno;
2264 int flags;
2265 struct newblk **newblkpp;
2266 {
2267 struct newblk *newblk;
2268
2269 LIST_FOREACH(newblk, newblkhd, nb_hash) {
2270 if (newblkno != newblk->nb_newblkno)
2271 continue;
2272 /*
2273 * If we're creating a new dependency don't match those that
2274 * have already been converted to allocdirects. This is for
2275 * a frag extend.
2276 */
2277 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2278 continue;
2279 break;
2280 }
2281 if (newblk) {
2282 *newblkpp = newblk;
2283 return (1);
2284 }
2285 *newblkpp = NULL;
2286 return (0);
2287 }
2288
2289 /*
2290 * Look up a newblk. Return 1 if found, 0 if not found.
2291 * If not found, allocate if DEPALLOC flag is passed.
2292 * Found or allocated entry is returned in newblkpp.
2293 */
2294 static int
2295 newblk_lookup(mp, newblkno, flags, newblkpp)
2296 struct mount *mp;
2297 ufs2_daddr_t newblkno;
2298 int flags;
2299 struct newblk **newblkpp;
2300 {
2301 struct newblk *newblk;
2302 struct newblk_hashhead *newblkhd;
2303 struct ufsmount *ump;
2304
2305 ump = VFSTOUFS(mp);
2306 LOCK_OWNED(ump);
2307 newblkhd = NEWBLK_HASH(ump, newblkno);
2308 if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2309 return (1);
2310 if ((flags & DEPALLOC) == 0)
2311 return (0);
2312 if (softdep_excess_items(ump, D_NEWBLK) ||
2313 softdep_excess_items(ump, D_ALLOCDIRECT) ||
2314 softdep_excess_items(ump, D_ALLOCINDIR))
2315 schedule_cleanup(mp);
2316 else
2317 FREE_LOCK(ump);
2318 newblk = malloc(sizeof(union allblk), M_NEWBLK,
2319 M_SOFTDEP_FLAGS | M_ZERO);
2320 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2321 ACQUIRE_LOCK(ump);
2322 if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2323 WORKITEM_FREE(newblk, D_NEWBLK);
2324 return (1);
2325 }
2326 newblk->nb_freefrag = NULL;
2327 LIST_INIT(&newblk->nb_indirdeps);
2328 LIST_INIT(&newblk->nb_newdirblk);
2329 LIST_INIT(&newblk->nb_jwork);
2330 newblk->nb_state = ATTACHED;
2331 newblk->nb_newblkno = newblkno;
2332 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2333 *newblkpp = newblk;
2334 return (0);
2335 }
2336
2337 /*
2338 * Structures and routines associated with freed indirect block caching.
2339 */
2340 #define INDIR_HASH(ump, blkno) \
2341 (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2342
2343 /*
2344 * Look up an indirect block in the indir hash table. The freework is
2345 * removed and potentially freed. The caller must do a blocking journal
2346 * write before writing to the blkno.
2347 */
2348 static int
2349 indirblk_lookup(mp, blkno)
2350 struct mount *mp;
2351 ufs2_daddr_t blkno;
2352 {
2353 struct freework *freework;
2354 struct indir_hashhead *wkhd;
2355 struct ufsmount *ump;
2356
2357 ump = VFSTOUFS(mp);
2358 wkhd = INDIR_HASH(ump, blkno);
2359 TAILQ_FOREACH(freework, wkhd, fw_next) {
2360 if (freework->fw_blkno != blkno)
2361 continue;
2362 indirblk_remove(freework);
2363 return (1);
2364 }
2365 return (0);
2366 }
2367
2368 /*
2369 * Insert an indirect block represented by freework into the indirblk
2370 * hash table so that it may prevent the block from being re-used prior
2371 * to the journal being written.
2372 */
2373 static void
2374 indirblk_insert(freework)
2375 struct freework *freework;
2376 {
2377 struct jblocks *jblocks;
2378 struct jseg *jseg;
2379 struct ufsmount *ump;
2380
2381 ump = VFSTOUFS(freework->fw_list.wk_mp);
2382 jblocks = ump->softdep_jblocks;
2383 jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2384 if (jseg == NULL)
2385 return;
2386
2387 LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2388 TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2389 fw_next);
2390 freework->fw_state &= ~DEPCOMPLETE;
2391 }
2392
2393 static void
2394 indirblk_remove(freework)
2395 struct freework *freework;
2396 {
2397 struct ufsmount *ump;
2398
2399 ump = VFSTOUFS(freework->fw_list.wk_mp);
2400 LIST_REMOVE(freework, fw_segs);
2401 TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2402 freework->fw_state |= DEPCOMPLETE;
2403 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2404 WORKITEM_FREE(freework, D_FREEWORK);
2405 }
2406
2407 /*
2408 * Executed during filesystem initialization before
2409 * mounting any filesystems.
2410 */
2411 void
2412 softdep_initialize()
2413 {
2414
2415 TAILQ_INIT(&softdepmounts);
2416 #ifdef __LP64__
2417 max_softdeps = desiredvnodes * 4;
2418 #else
2419 max_softdeps = desiredvnodes * 2;
2420 #endif
2421
2422 /* initialize bioops hack */
2423 bioops.io_start = softdep_disk_io_initiation;
2424 bioops.io_complete = softdep_disk_write_complete;
2425 bioops.io_deallocate = softdep_deallocate_dependencies;
2426 bioops.io_countdeps = softdep_count_dependencies;
2427 softdep_ast_cleanup = softdep_ast_cleanup_proc;
2428
2429 /* Initialize the callout with an mtx. */
2430 callout_init_mtx(&softdep_callout, &lk, 0);
2431 }
2432
2433 /*
2434 * Executed after all filesystems have been unmounted during
2435 * filesystem module unload.
2436 */
2437 void
2438 softdep_uninitialize()
2439 {
2440
2441 /* clear bioops hack */
2442 bioops.io_start = NULL;
2443 bioops.io_complete = NULL;
2444 bioops.io_deallocate = NULL;
2445 bioops.io_countdeps = NULL;
2446 softdep_ast_cleanup = NULL;
2447
2448 callout_drain(&softdep_callout);
2449 }
2450
2451 /*
2452 * Called at mount time to notify the dependency code that a
2453 * filesystem wishes to use it.
2454 */
2455 int
2456 softdep_mount(devvp, mp, fs, cred)
2457 struct vnode *devvp;
2458 struct mount *mp;
2459 struct fs *fs;
2460 struct ucred *cred;
2461 {
2462 struct csum_total cstotal;
2463 struct mount_softdeps *sdp;
2464 struct ufsmount *ump;
2465 struct cg *cgp;
2466 struct buf *bp;
2467 int i, error, cyl;
2468
2469 sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2470 M_WAITOK | M_ZERO);
2471 MNT_ILOCK(mp);
2472 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2473 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2474 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2475 MNTK_SOFTDEP | MNTK_NOASYNC;
2476 }
2477 ump = VFSTOUFS(mp);
2478 ump->um_softdep = sdp;
2479 MNT_IUNLOCK(mp);
2480 rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2481 sdp->sd_ump = ump;
2482 LIST_INIT(&ump->softdep_workitem_pending);
2483 LIST_INIT(&ump->softdep_journal_pending);
2484 TAILQ_INIT(&ump->softdep_unlinked);
2485 LIST_INIT(&ump->softdep_dirtycg);
2486 ump->softdep_worklist_tail = NULL;
2487 ump->softdep_on_worklist = 0;
2488 ump->softdep_deps = 0;
2489 LIST_INIT(&ump->softdep_mkdirlisthd);
2490 ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2491 &ump->pagedep_hash_size);
2492 ump->pagedep_nextclean = 0;
2493 ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2494 &ump->inodedep_hash_size);
2495 ump->inodedep_nextclean = 0;
2496 ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK,
2497 &ump->newblk_hash_size);
2498 ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2499 &ump->bmsafemap_hash_size);
2500 i = 1 << (ffs(desiredvnodes / 10) - 1);
2501 ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2502 M_FREEWORK, M_WAITOK);
2503 ump->indir_hash_size = i - 1;
2504 for (i = 0; i <= ump->indir_hash_size; i++)
2505 TAILQ_INIT(&ump->indir_hashtbl[i]);
2506 ACQUIRE_GBLLOCK(&lk);
2507 TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2508 FREE_GBLLOCK(&lk);
2509 if ((fs->fs_flags & FS_SUJ) &&
2510 (error = journal_mount(mp, fs, cred)) != 0) {
2511 printf("Failed to start journal: %d\n", error);
2512 softdep_unmount(mp);
2513 return (error);
2514 }
2515 /*
2516 * Start our flushing thread in the bufdaemon process.
2517 */
2518 ACQUIRE_LOCK(ump);
2519 ump->softdep_flags |= FLUSH_STARTING;
2520 FREE_LOCK(ump);
2521 kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2522 &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2523 mp->mnt_stat.f_mntonname);
2524 ACQUIRE_LOCK(ump);
2525 while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2526 msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2527 hz / 2);
2528 }
2529 FREE_LOCK(ump);
2530 /*
2531 * When doing soft updates, the counters in the
2532 * superblock may have gotten out of sync. Recomputation
2533 * can take a long time and can be deferred for background
2534 * fsck. However, the old behavior of scanning the cylinder
2535 * groups and recalculating them at mount time is available
2536 * by setting vfs.ffs.compute_summary_at_mount to one.
2537 */
2538 if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2539 return (0);
2540 bzero(&cstotal, sizeof cstotal);
2541 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2542 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2543 fs->fs_cgsize, cred, &bp)) != 0) {
2544 brelse(bp);
2545 softdep_unmount(mp);
2546 return (error);
2547 }
2548 cgp = (struct cg *)bp->b_data;
2549 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2550 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2551 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2552 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2553 fs->fs_cs(fs, cyl) = cgp->cg_cs;
2554 brelse(bp);
2555 }
2556 #ifdef DEBUG
2557 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2558 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2559 #endif
2560 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2561 return (0);
2562 }
2563
2564 void
2565 softdep_unmount(mp)
2566 struct mount *mp;
2567 {
2568 struct ufsmount *ump;
2569 #ifdef INVARIANTS
2570 int i;
2571 #endif
2572
2573 KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2574 ("softdep_unmount called on non-softdep filesystem"));
2575 ump = VFSTOUFS(mp);
2576 MNT_ILOCK(mp);
2577 mp->mnt_flag &= ~MNT_SOFTDEP;
2578 if (MOUNTEDSUJ(mp) == 0) {
2579 MNT_IUNLOCK(mp);
2580 } else {
2581 mp->mnt_flag &= ~MNT_SUJ;
2582 MNT_IUNLOCK(mp);
2583 journal_unmount(ump);
2584 }
2585 /*
2586 * Shut down our flushing thread. The NULL check handles the case
2587 * where softdep_mount errored out before the thread was created.
2588 */
2589 if (ump->softdep_flushtd != NULL) {
2590 ACQUIRE_LOCK(ump);
2591 ump->softdep_flags |= FLUSH_EXIT;
2592 wakeup(&ump->softdep_flushtd);
2593 msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2594 "sdwait", 0);
2595 KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2596 ("Thread shutdown failed"));
2597 }
2598 /*
2599 * Free up our resources.
2600 */
2601 ACQUIRE_GBLLOCK(&lk);
2602 TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2603 FREE_GBLLOCK(&lk);
2604 rw_destroy(LOCK_PTR(ump));
2605 hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2606 hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2607 hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2608 hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2609 ump->bmsafemap_hash_size);
2610 free(ump->indir_hashtbl, M_FREEWORK);
2611 #ifdef INVARIANTS
2612 for (i = 0; i <= D_LAST; i++)
2613 KASSERT(ump->softdep_curdeps[i] == 0,
2614 ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2615 TYPENAME(i), ump->softdep_curdeps[i]));
2616 #endif
2617 free(ump->um_softdep, M_MOUNTDATA);
2618 }
2619
2620 static struct jblocks *
2621 jblocks_create(void)
2622 {
2623 struct jblocks *jblocks;
2624
2625 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2626 TAILQ_INIT(&jblocks->jb_segs);
2627 jblocks->jb_avail = 10;
2628 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2629 M_JBLOCKS, M_WAITOK | M_ZERO);
2630
2631 return (jblocks);
2632 }
2633
2634 static ufs2_daddr_t
2635 jblocks_alloc(jblocks, bytes, actual)
2636 struct jblocks *jblocks;
2637 int bytes;
2638 int *actual;
2639 {
2640 ufs2_daddr_t daddr;
2641 struct jextent *jext;
2642 int freecnt;
2643 int blocks;
2644
2645 blocks = bytes / DEV_BSIZE;
2646 jext = &jblocks->jb_extent[jblocks->jb_head];
2647 freecnt = jext->je_blocks - jblocks->jb_off;
2648 if (freecnt == 0) {
2649 jblocks->jb_off = 0;
2650 if (++jblocks->jb_head > jblocks->jb_used)
2651 jblocks->jb_head = 0;
2652 jext = &jblocks->jb_extent[jblocks->jb_head];
2653 freecnt = jext->je_blocks;
2654 }
2655 if (freecnt > blocks)
2656 freecnt = blocks;
2657 *actual = freecnt * DEV_BSIZE;
2658 daddr = jext->je_daddr + jblocks->jb_off;
2659 jblocks->jb_off += freecnt;
2660 jblocks->jb_free -= freecnt;
2661
2662 return (daddr);
2663 }
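/*
 * Note on the allocator above: the journal is consumed as a circular log
 * over the jb_extent[] array.  jb_head and jb_off track the write
 * position, and an allocation never spans an extent boundary, so the size
 * returned through *actual may be smaller than the size requested (for
 * example, asking for 16 blocks when only 6 remain in the current extent
 * yields 6).  Callers such as softdep_process_journal() size their buffer
 * from the returned value.
 */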
2664
2665 static void
2666 jblocks_free(jblocks, mp, bytes)
2667 struct jblocks *jblocks;
2668 struct mount *mp;
2669 int bytes;
2670 {
2671
2672 LOCK_OWNED(VFSTOUFS(mp));
2673 jblocks->jb_free += bytes / DEV_BSIZE;
2674 if (jblocks->jb_suspended)
2675 worklist_speedup(mp);
2676 wakeup(jblocks);
2677 }
2678
2679 static void
2680 jblocks_destroy(jblocks)
2681 struct jblocks *jblocks;
2682 {
2683
2684 if (jblocks->jb_extent)
2685 free(jblocks->jb_extent, M_JBLOCKS);
2686 free(jblocks, M_JBLOCKS);
2687 }
2688
2689 static void
2690 jblocks_add(jblocks, daddr, blocks)
2691 struct jblocks *jblocks;
2692 ufs2_daddr_t daddr;
2693 int blocks;
2694 {
2695 struct jextent *jext;
2696
2697 jblocks->jb_blocks += blocks;
2698 jblocks->jb_free += blocks;
2699 jext = &jblocks->jb_extent[jblocks->jb_used];
2700 /* Adding the first block. */
2701 if (jext->je_daddr == 0) {
2702 jext->je_daddr = daddr;
2703 jext->je_blocks = blocks;
2704 return;
2705 }
2706 /* Extending the last extent. */
2707 if (jext->je_daddr + jext->je_blocks == daddr) {
2708 jext->je_blocks += blocks;
2709 return;
2710 }
2711 /* Adding a new extent. */
2712 if (++jblocks->jb_used == jblocks->jb_avail) {
2713 jblocks->jb_avail *= 2;
2714 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2715 M_JBLOCKS, M_WAITOK | M_ZERO);
2716 memcpy(jext, jblocks->jb_extent,
2717 sizeof(struct jextent) * jblocks->jb_used);
2718 free(jblocks->jb_extent, M_JBLOCKS);
2719 jblocks->jb_extent = jext;
2720 }
2721 jext = &jblocks->jb_extent[jblocks->jb_used];
2722 jext->je_daddr = daddr;
2723 jext->je_blocks = blocks;
2724 return;
2725 }
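/*
 * For illustration: jblocks_add(jblocks, 1000, 8) followed by
 * jblocks_add(jblocks, 1008, 8) extends the existing extent to 16 blocks,
 * while adding a non-contiguous address starts a new extent, doubling the
 * jb_extent[] array whenever jb_used catches up with jb_avail.
 */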
2726
2727 int
2728 softdep_journal_lookup(mp, vpp)
2729 struct mount *mp;
2730 struct vnode **vpp;
2731 {
2732 struct componentname cnp;
2733 struct vnode *dvp;
2734 ino_t sujournal;
2735 int error;
2736
2737 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2738 if (error)
2739 return (error);
2740 bzero(&cnp, sizeof(cnp));
2741 cnp.cn_nameiop = LOOKUP;
2742 cnp.cn_flags = ISLASTCN;
2743 cnp.cn_thread = curthread;
2744 cnp.cn_cred = curthread->td_ucred;
2745 cnp.cn_pnbuf = SUJ_FILE;
2746 cnp.cn_nameptr = SUJ_FILE;
2747 cnp.cn_namelen = strlen(SUJ_FILE);
2748 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2749 vput(dvp);
2750 if (error != 0)
2751 return (error);
2752 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2753 return (error);
2754 }
2755
2756 /*
2757 * Open and verify the journal file.
2758 */
2759 static int
2760 journal_mount(mp, fs, cred)
2761 struct mount *mp;
2762 struct fs *fs;
2763 struct ucred *cred;
2764 {
2765 struct jblocks *jblocks;
2766 struct ufsmount *ump;
2767 struct vnode *vp;
2768 struct inode *ip;
2769 ufs2_daddr_t blkno;
2770 int bcount;
2771 int error;
2772 int i;
2773
2774 ump = VFSTOUFS(mp);
2775 ump->softdep_journal_tail = NULL;
2776 ump->softdep_on_journal = 0;
2777 ump->softdep_accdeps = 0;
2778 ump->softdep_req = 0;
2779 ump->softdep_jblocks = NULL;
2780 error = softdep_journal_lookup(mp, &vp);
2781 if (error != 0) {
2782 printf("Failed to find journal. Use tunefs to create one\n");
2783 return (error);
2784 }
2785 ip = VTOI(vp);
2786 if (ip->i_size < SUJ_MIN) {
2787 error = ENOSPC;
2788 goto out;
2789 }
2790 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
2791 jblocks = jblocks_create();
2792 for (i = 0; i < bcount; i++) {
2793 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2794 if (error)
2795 break;
2796 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2797 }
2798 if (error) {
2799 jblocks_destroy(jblocks);
2800 goto out;
2801 }
2802 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
2803 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2804 ump->softdep_jblocks = jblocks;
2805 out:
2806 if (error == 0) {
2807 MNT_ILOCK(mp);
2808 mp->mnt_flag |= MNT_SUJ;
2809 mp->mnt_flag &= ~MNT_SOFTDEP;
2810 MNT_IUNLOCK(mp);
2811 /*
2812 * Only validate the journal contents if the
2813 * filesystem is clean, otherwise we write the logs
2814 * but they'll never be used. If the filesystem was
2815 * still dirty when we mounted it the journal is
2816 * invalid and a new journal can only be valid if it
2817 * starts from a clean mount.
2818 */
2819 if (fs->fs_clean) {
2820 DIP_SET(ip, i_modrev, fs->fs_mtime);
2821 ip->i_flags |= IN_MODIFIED;
2822 ffs_update(vp, 1);
2823 }
2824 }
2825 vput(vp);
2826 return (error);
2827 }
2828
2829 static void
2830 journal_unmount(ump)
2831 struct ufsmount *ump;
2832 {
2833
2834 if (ump->softdep_jblocks)
2835 jblocks_destroy(ump->softdep_jblocks);
2836 ump->softdep_jblocks = NULL;
2837 }
2838
2839 /*
2840 * Called when a journal record is ready to be written. Space is allocated
2841 * and the journal entry is created when the journal is flushed to stable
2842 * store.
2843 */
2844 static void
2845 add_to_journal(wk)
2846 struct worklist *wk;
2847 {
2848 struct ufsmount *ump;
2849
2850 ump = VFSTOUFS(wk->wk_mp);
2851 LOCK_OWNED(ump);
2852 if (wk->wk_state & ONWORKLIST)
2853 panic("add_to_journal: %s(0x%X) already on list",
2854 TYPENAME(wk->wk_type), wk->wk_state);
2855 wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2856 if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2857 ump->softdep_jblocks->jb_age = ticks;
2858 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2859 } else
2860 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2861 ump->softdep_journal_tail = wk;
2862 ump->softdep_on_journal += 1;
2863 }
2864
2865 /*
2866 * Remove an arbitrary item from the journal worklist while maintaining
2867 * the tail pointer. This happens when a new operation obviates the need to
2868 * journal an old operation.
2869 */
2870 static void
2871 remove_from_journal(wk)
2872 struct worklist *wk;
2873 {
2874 struct ufsmount *ump;
2875
2876 ump = VFSTOUFS(wk->wk_mp);
2877 LOCK_OWNED(ump);
2878 #ifdef SUJ_DEBUG
2879 {
2880 struct worklist *wkn;
2881
2882 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2883 if (wkn == wk)
2884 break;
2885 if (wkn == NULL)
2886 panic("remove_from_journal: %p is not in journal", wk);
2887 }
2888 #endif
2889 /*
2890 * We emulate a TAILQ to save space in most structures which do not
2891 * require TAILQ semantics. Here we must update the tail position
2892 * when removing the tail which is not the final entry. This works
2893 * only if the worklist linkage is at the beginning of the structure.
2894 */
2895 if (ump->softdep_journal_tail == wk)
2896 ump->softdep_journal_tail =
2897 (struct worklist *)wk->wk_list.le_prev;
2898 WORKLIST_REMOVE(wk);
2899 ump->softdep_on_journal -= 1;
2900 }
2901
2902 /*
2903 * Check for journal space as well as dependency limits so the prelink
2904 * code can throttle both journaled and non-journaled filesystems.
2905 * Threshold is 0 for low and 1 for min.
2906 */
2907 static int
2908 journal_space(ump, thresh)
2909 struct ufsmount *ump;
2910 int thresh;
2911 {
2912 struct jblocks *jblocks;
2913 int limit, avail;
2914
2915 jblocks = ump->softdep_jblocks;
2916 if (jblocks == NULL)
2917 return (1);
2918 /*
2919 * We use a tighter restriction here so that request_cleanup(),
2920 * running in other threads, does not run into locks we currently hold.
2921 * We have to be over the limit and our filesystem has to be
2922 * responsible for more than our share of that usage.
2923 */
2924 limit = (max_softdeps / 10) * 9;
2925 if (dep_current[D_INODEDEP] > limit &&
2926 ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2927 return (0);
2928 if (thresh)
2929 thresh = jblocks->jb_min;
2930 else
2931 thresh = jblocks->jb_low;
2932 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2933 avail = jblocks->jb_free - avail;
2934
2935 return (avail > thresh);
2936 }
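/*
 * Worked example for the calculation above (illustrative figures, assuming
 * the 32-byte JREC_SIZE implied by the comment preceding softdep_prealloc()
 * below): with 1000 records pending on the journal worklist,
 * 1000 * 32 / 512 = 62 sectors are already spoken for, so a journal with
 * jb_free = 100 has avail = 38 and passes only if 38 exceeds the chosen
 * jb_low or jb_min threshold.
 */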
2937
2938 static void
2939 journal_suspend(ump)
2940 struct ufsmount *ump;
2941 {
2942 struct jblocks *jblocks;
2943 struct mount *mp;
2944
2945 mp = UFSTOVFS(ump);
2946 jblocks = ump->softdep_jblocks;
2947 MNT_ILOCK(mp);
2948 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2949 stat_journal_min++;
2950 mp->mnt_kern_flag |= MNTK_SUSPEND;
2951 mp->mnt_susp_owner = ump->softdep_flushtd;
2952 }
2953 jblocks->jb_suspended = 1;
2954 MNT_IUNLOCK(mp);
2955 }
2956
2957 static int
2958 journal_unsuspend(struct ufsmount *ump)
2959 {
2960 struct jblocks *jblocks;
2961 struct mount *mp;
2962
2963 mp = UFSTOVFS(ump);
2964 jblocks = ump->softdep_jblocks;
2965
2966 if (jblocks != NULL && jblocks->jb_suspended &&
2967 journal_space(ump, jblocks->jb_min)) {
2968 jblocks->jb_suspended = 0;
2969 FREE_LOCK(ump);
2970 mp->mnt_susp_owner = curthread;
2971 vfs_write_resume(mp, 0);
2972 ACQUIRE_LOCK(ump);
2973 return (1);
2974 }
2975 return (0);
2976 }
2977
2978 /*
2979 * Called before any allocation function to be certain that there is
2980 * sufficient space in the journal prior to creating any new records.
2981 * Since in the case of block allocation we may have multiple locked
2982 * buffers at the time of the actual allocation we can not block
2983 * when the journal records are created. Doing so would create a deadlock
2984 * if any of these buffers needed to be flushed to reclaim space. Instead
2985 * we require a sufficiently large amount of available space such that
2986 * each thread in the system could have passed this allocation check and
2987 * still have sufficient free space. With 20% of a minimum journal size
2988 * of 1MB we have 6553 records available.
2989 */
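/*
 * Arithmetic behind the figure above: 20% of a 1MB journal is about
 * 209,715 bytes; at 32 bytes per journal record that is roughly 6,553
 * records.
 */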
2990 int
2991 softdep_prealloc(vp, waitok)
2992 struct vnode *vp;
2993 int waitok;
2994 {
2995 struct ufsmount *ump;
2996
2997 KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2998 ("softdep_prealloc called on non-softdep filesystem"));
2999 /*
3000 * Nothing to do if we are not running journaled soft updates.
3001 * If we currently hold the snapshot lock, we must avoid
3002 * handling other resources that could cause deadlock. Do not
3003 * touch quotas vnode since it is typically recursed with
3004 * other vnode locks held.
3005 */
3006 if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3007 (vp->v_vflag & VV_SYSTEM) != 0)
3008 return (0);
3009 ump = VFSTOUFS(vp->v_mount);
3010 ACQUIRE_LOCK(ump);
3011 if (journal_space(ump, 0)) {
3012 FREE_LOCK(ump);
3013 return (0);
3014 }
3015 stat_journal_low++;
3016 FREE_LOCK(ump);
3017 if (waitok == MNT_NOWAIT)
3018 return (ENOSPC);
3019 /*
3020 * Attempt to sync this vnode once to flush any journal
3021 * work attached to it.
3022 */
3023 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3024 ffs_syncvnode(vp, waitok, 0);
3025 ACQUIRE_LOCK(ump);
3026 process_removes(vp);
3027 process_truncates(vp);
3028 if (journal_space(ump, 0) == 0) {
3029 softdep_speedup(ump);
3030 if (journal_space(ump, 1) == 0)
3031 journal_suspend(ump);
3032 }
3033 FREE_LOCK(ump);
3034
3035 return (0);
3036 }
3037
3038 /*
3039 * Before adjusting a link count on a vnode verify that we have sufficient
3040 * journal space. If not, process operations that depend on the currently
3041 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3042 * and softdep flush threads can not acquire these locks to reclaim space.
3043 */
3044 static void
3045 softdep_prelink(dvp, vp)
3046 struct vnode *dvp;
3047 struct vnode *vp;
3048 {
3049 struct ufsmount *ump;
3050
3051 ump = VFSTOUFS(dvp->v_mount);
3052 LOCK_OWNED(ump);
3053 /*
3054 * Nothing to do if we have sufficient journal space.
3055 * If we currently hold the snapshot lock, we must avoid
3056 * handling other resources that could cause deadlock.
3057 */
3058 if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3059 return;
3060 stat_journal_low++;
3061 FREE_LOCK(ump);
3062 if (vp)
3063 ffs_syncvnode(vp, MNT_NOWAIT, 0);
3064 ffs_syncvnode(dvp, MNT_WAIT, 0);
3065 ACQUIRE_LOCK(ump);
3066 /* Process vp before dvp as it may create .. removes. */
3067 if (vp) {
3068 process_removes(vp);
3069 process_truncates(vp);
3070 }
3071 process_removes(dvp);
3072 process_truncates(dvp);
3073 softdep_speedup(ump);
3074 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3075 if (journal_space(ump, 0) == 0) {
3076 softdep_speedup(ump);
3077 if (journal_space(ump, 1) == 0)
3078 journal_suspend(ump);
3079 }
3080 }
3081
3082 static void
3083 jseg_write(ump, jseg, data)
3084 struct ufsmount *ump;
3085 struct jseg *jseg;
3086 uint8_t *data;
3087 {
3088 struct jsegrec *rec;
3089
3090 rec = (struct jsegrec *)data;
3091 rec->jsr_seq = jseg->js_seq;
3092 rec->jsr_oldest = jseg->js_oldseq;
3093 rec->jsr_cnt = jseg->js_cnt;
3094 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3095 rec->jsr_crc = 0;
3096 rec->jsr_time = ump->um_fs->fs_mtime;
3097 }
3098
3099 static inline void
3100 inoref_write(inoref, jseg, rec)
3101 struct inoref *inoref;
3102 struct jseg *jseg;
3103 struct jrefrec *rec;
3104 {
3105
3106 inoref->if_jsegdep->jd_seg = jseg;
3107 rec->jr_ino = inoref->if_ino;
3108 rec->jr_parent = inoref->if_parent;
3109 rec->jr_nlink = inoref->if_nlink;
3110 rec->jr_mode = inoref->if_mode;
3111 rec->jr_diroff = inoref->if_diroff;
3112 }
3113
3114 static void
3115 jaddref_write(jaddref, jseg, data)
3116 struct jaddref *jaddref;
3117 struct jseg *jseg;
3118 uint8_t *data;
3119 {
3120 struct jrefrec *rec;
3121
3122 rec = (struct jrefrec *)data;
3123 rec->jr_op = JOP_ADDREF;
3124 inoref_write(&jaddref->ja_ref, jseg, rec);
3125 }
3126
3127 static void
3128 jremref_write(jremref, jseg, data)
3129 struct jremref *jremref;
3130 struct jseg *jseg;
3131 uint8_t *data;
3132 {
3133 struct jrefrec *rec;
3134
3135 rec = (struct jrefrec *)data;
3136 rec->jr_op = JOP_REMREF;
3137 inoref_write(&jremref->jr_ref, jseg, rec);
3138 }
3139
3140 static void
3141 jmvref_write(jmvref, jseg, data)
3142 struct jmvref *jmvref;
3143 struct jseg *jseg;
3144 uint8_t *data;
3145 {
3146 struct jmvrec *rec;
3147
3148 rec = (struct jmvrec *)data;
3149 rec->jm_op = JOP_MVREF;
3150 rec->jm_ino = jmvref->jm_ino;
3151 rec->jm_parent = jmvref->jm_parent;
3152 rec->jm_oldoff = jmvref->jm_oldoff;
3153 rec->jm_newoff = jmvref->jm_newoff;
3154 }
3155
3156 static void
3157 jnewblk_write(jnewblk, jseg, data)
3158 struct jnewblk *jnewblk;
3159 struct jseg *jseg;
3160 uint8_t *data;
3161 {
3162 struct jblkrec *rec;
3163
3164 jnewblk->jn_jsegdep->jd_seg = jseg;
3165 rec = (struct jblkrec *)data;
3166 rec->jb_op = JOP_NEWBLK;
3167 rec->jb_ino = jnewblk->jn_ino;
3168 rec->jb_blkno = jnewblk->jn_blkno;
3169 rec->jb_lbn = jnewblk->jn_lbn;
3170 rec->jb_frags = jnewblk->jn_frags;
3171 rec->jb_oldfrags = jnewblk->jn_oldfrags;
3172 }
3173
3174 static void
3175 jfreeblk_write(jfreeblk, jseg, data)
3176 struct jfreeblk *jfreeblk;
3177 struct jseg *jseg;
3178 uint8_t *data;
3179 {
3180 struct jblkrec *rec;
3181
3182 jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3183 rec = (struct jblkrec *)data;
3184 rec->jb_op = JOP_FREEBLK;
3185 rec->jb_ino = jfreeblk->jf_ino;
3186 rec->jb_blkno = jfreeblk->jf_blkno;
3187 rec->jb_lbn = jfreeblk->jf_lbn;
3188 rec->jb_frags = jfreeblk->jf_frags;
3189 rec->jb_oldfrags = 0;
3190 }
3191
3192 static void
3193 jfreefrag_write(jfreefrag, jseg, data)
3194 struct jfreefrag *jfreefrag;
3195 struct jseg *jseg;
3196 uint8_t *data;
3197 {
3198 struct jblkrec *rec;
3199
3200 jfreefrag->fr_jsegdep->jd_seg = jseg;
3201 rec = (struct jblkrec *)data;
3202 rec->jb_op = JOP_FREEBLK;
3203 rec->jb_ino = jfreefrag->fr_ino;
3204 rec->jb_blkno = jfreefrag->fr_blkno;
3205 rec->jb_lbn = jfreefrag->fr_lbn;
3206 rec->jb_frags = jfreefrag->fr_frags;
3207 rec->jb_oldfrags = 0;
3208 }
3209
3210 static void
3211 jtrunc_write(jtrunc, jseg, data)
3212 struct jtrunc *jtrunc;
3213 struct jseg *jseg;
3214 uint8_t *data;
3215 {
3216 struct jtrncrec *rec;
3217
3218 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3219 rec = (struct jtrncrec *)data;
3220 rec->jt_op = JOP_TRUNC;
3221 rec->jt_ino = jtrunc->jt_ino;
3222 rec->jt_size = jtrunc->jt_size;
3223 rec->jt_extsize = jtrunc->jt_extsize;
3224 }
3225
3226 static void
3227 jfsync_write(jfsync, jseg, data)
3228 struct jfsync *jfsync;
3229 struct jseg *jseg;
3230 uint8_t *data;
3231 {
3232 struct jtrncrec *rec;
3233
3234 rec = (struct jtrncrec *)data;
3235 rec->jt_op = JOP_SYNC;
3236 rec->jt_ino = jfsync->jfs_ino;
3237 rec->jt_size = jfsync->jfs_size;
3238 rec->jt_extsize = jfsync->jfs_extsize;
3239 }
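/*
 * Summary of the record writers above: each journal record occupies a
 * fixed JREC_SIZE slot in the segment buffer.  jseg_write() lays down a
 * jsegrec header at the start of the buffer and at the start of every
 * subsequent device block (see softdep_process_journal() below), and each
 * payload record identifies itself through its op field: JOP_ADDREF,
 * JOP_REMREF, JOP_MVREF, JOP_NEWBLK, JOP_FREEBLK (also used for freed
 * fragments), JOP_TRUNC and JOP_SYNC.
 */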
3240
3241 static void
3242 softdep_flushjournal(mp)
3243 struct mount *mp;
3244 {
3245 struct jblocks *jblocks;
3246 struct ufsmount *ump;
3247
3248 if (MOUNTEDSUJ(mp) == 0)
3249 return;
3250 ump = VFSTOUFS(mp);
3251 jblocks = ump->softdep_jblocks;
3252 ACQUIRE_LOCK(ump);
3253 while (ump->softdep_on_journal) {
3254 jblocks->jb_needseg = 1;
3255 softdep_process_journal(mp, NULL, MNT_WAIT);
3256 }
3257 FREE_LOCK(ump);
3258 }
3259
3260 static void softdep_synchronize_completed(struct bio *);
3261 static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3262
3263 static void
3264 softdep_synchronize_completed(bp)
3265 struct bio *bp;
3266 {
3267 struct jseg *oldest;
3268 struct jseg *jseg;
3269 struct ufsmount *ump;
3270
3271 /*
3272 * caller1 marks the last segment written before we issued the
3273 * synchronize cache.
3274 */
3275 jseg = bp->bio_caller1;
3276 if (jseg == NULL) {
3277 g_destroy_bio(bp);
3278 return;
3279 }
3280 ump = VFSTOUFS(jseg->js_list.wk_mp);
3281 ACQUIRE_LOCK(ump);
3282 oldest = NULL;
3283 /*
3284 * Mark all the journal entries waiting on the synchronize cache
3285 * as completed so they may continue on.
3286 */
3287 while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3288 jseg->js_state |= COMPLETE;
3289 oldest = jseg;
3290 jseg = TAILQ_PREV(jseg, jseglst, js_next);
3291 }
3292 /*
3293 * Restart deferred journal entry processing from the oldest
3294 * completed jseg.
3295 */
3296 if (oldest)
3297 complete_jsegs(oldest);
3298
3299 FREE_LOCK(ump);
3300 g_destroy_bio(bp);
3301 }
3302
3303 /*
3304 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3305 * barriers. The journal must be written prior to any blocks that depend
3306 * on it and the journal can not be released until the blocks have been
3307 * written. This code handles both barriers simultaneously.
3308 */
3309 static void
3310 softdep_synchronize(bp, ump, caller1)
3311 struct bio *bp;
3312 struct ufsmount *ump;
3313 void *caller1;
3314 {
3315
3316 bp->bio_cmd = BIO_FLUSH;
3317 bp->bio_flags |= BIO_ORDERED;
3318 bp->bio_data = NULL;
3319 bp->bio_offset = ump->um_cp->provider->mediasize;
3320 bp->bio_length = 0;
3321 bp->bio_done = softdep_synchronize_completed;
3322 bp->bio_caller1 = caller1;
3323 g_io_request(bp,
3324 (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3325 }
3326
3327 /*
3328 * Flush some journal records to disk.
3329 */
3330 static void
3331 softdep_process_journal(mp, needwk, flags)
3332 struct mount *mp;
3333 struct worklist *needwk;
3334 int flags;
3335 {
3336 struct jblocks *jblocks;
3337 struct ufsmount *ump;
3338 struct worklist *wk;
3339 struct jseg *jseg;
3340 struct buf *bp;
3341 struct bio *bio;
3342 uint8_t *data;
3343 struct fs *fs;
3344 int shouldflush;
3345 int segwritten;
3346 int jrecmin; /* Minimum records per block. */
3347 int jrecmax; /* Maximum records per block. */
3348 int size;
3349 int cnt;
3350 int off;
3351 int devbsize;
3352
3353 if (MOUNTEDSUJ(mp) == 0)
3354 return;
3355 shouldflush = softdep_flushcache;
3356 bio = NULL;
3357 jseg = NULL;
3358 ump = VFSTOUFS(mp);
3359 LOCK_OWNED(ump);
3360 fs = ump->um_fs;
3361 jblocks = ump->softdep_jblocks;
3362 devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3363 /*
3364 * We write anywhere between a disk block and fs block. The upper
3365 * bound is picked to prevent buffer cache fragmentation and limit
3366 * processing time per I/O.
3367 */
3368 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3369 jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
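	/*
	 * For example, with 512-byte device blocks and 32-byte journal
	 * records (the sizes implied elsewhere in this file), jrecmin is
	 * 512 / 32 - 1 = 15, and for a 32K filesystem block jrecmax is
	 * (32768 / 512) * 15 = 960.
	 */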
3370 segwritten = 0;
3371 for (;;) {
3372 cnt = ump->softdep_on_journal;
3373 /*
3374 * Criteria for writing a segment:
3375 * 1) We have a full block.
3376 * 2) We're called from jwait() and haven't found the
3377 * journal item yet.
3378 * 3) Always write if needseg is set.
3379 * 4) If we are called from process_worklist and have
3380 * not yet written anything we write a partial block
3381 * to enforce a 1 second maximum latency on journal
3382 * entries.
3383 */
3384 if (cnt < (jrecmax - 1) && needwk == NULL &&
3385 jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3386 break;
3387 cnt++;
3388 /*
3389 * Verify some free journal space. softdep_prealloc() should
3390 * guarantee that we don't run out so this is indicative of
3391 * a problem with the flow control. Try to recover
3392 * gracefully in any event.
3393 */
3394 while (jblocks->jb_free == 0) {
3395 if (flags != MNT_WAIT)
3396 break;
3397 printf("softdep: Out of journal space!\n");
3398 softdep_speedup(ump);
3399 msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3400 }
3401 FREE_LOCK(ump);
3402 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3403 workitem_alloc(&jseg->js_list, D_JSEG, mp);
3404 LIST_INIT(&jseg->js_entries);
3405 LIST_INIT(&jseg->js_indirs);
3406 jseg->js_state = ATTACHED;
3407 if (shouldflush == 0)
3408 jseg->js_state |= COMPLETE;
3409 else if (bio == NULL)
3410 bio = g_alloc_bio();
3411 jseg->js_jblocks = jblocks;
3412 bp = geteblk(fs->fs_bsize, 0);
3413 ACQUIRE_LOCK(ump);
3414 /*
3415 * If there was a race while we were allocating the block
3416 * and jseg, the entry we care about was likely written.
3417 * We bail out in both the WAIT and NOWAIT case and assume
3418 * the caller will loop if the entry it cares about is
3419 * not written.
3420 */
3421 cnt = ump->softdep_on_journal;
3422 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3423 bp->b_flags |= B_INVAL | B_NOCACHE;
3424 WORKITEM_FREE(jseg, D_JSEG);
3425 FREE_LOCK(ump);
3426 brelse(bp);
3427 ACQUIRE_LOCK(ump);
3428 break;
3429 }
3430 /*
3431 * Calculate the disk block size required for the available
3432 * records rounded to the min size.
3433 */
3434 if (cnt == 0)
3435 size = devbsize;
3436 else if (cnt < jrecmax)
3437 size = howmany(cnt, jrecmin) * devbsize;
3438 else
3439 size = fs->fs_bsize;
3440 /*
3441 * Allocate a disk block for this journal data and account
3442 * for truncation of the requested size if enough contiguous
3443 * space was not available.
3444 */
3445 bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3446 bp->b_lblkno = bp->b_blkno;
3447 bp->b_offset = bp->b_blkno * DEV_BSIZE;
3448 bp->b_bcount = size;
3449 bp->b_flags &= ~B_INVAL;
3450 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3451 /*
3452 * Initialize our jseg with cnt records. Assign the next
3453 * sequence number to it and link it in-order.
3454 */
3455 cnt = MIN(cnt, (size / devbsize) * jrecmin);
3456 jseg->js_buf = bp;
3457 jseg->js_cnt = cnt;
3458 jseg->js_refs = cnt + 1; /* Self ref. */
3459 jseg->js_size = size;
3460 jseg->js_seq = jblocks->jb_nextseq++;
3461 if (jblocks->jb_oldestseg == NULL)
3462 jblocks->jb_oldestseg = jseg;
3463 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3464 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3465 if (jblocks->jb_writeseg == NULL)
3466 jblocks->jb_writeseg = jseg;
3467 /*
3468 * Start filling in records from the pending list.
3469 */
3470 data = bp->b_data;
3471 off = 0;
3472
3473 /*
3474 * Always put a header on the first block.
3475 * XXX As with below, there might not be a chance to get
3476 * into the loop. Ensure that something valid is written.
3477 */
3478 jseg_write(ump, jseg, data);
3479 off += JREC_SIZE;
3480 data = bp->b_data + off;
3481
3482 /*
3483 * XXX Something is wrong here. There's no work to do,
3484 * but we need to perform an I/O and allow it to complete
3485 * anyway.
3486 */
3487 if (LIST_EMPTY(&ump->softdep_journal_pending))
3488 stat_emptyjblocks++;
3489
3490 while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3491 != NULL) {
3492 if (cnt == 0)
3493 break;
3494 /* Place a segment header on every device block. */
3495 if ((off % devbsize) == 0) {
3496 jseg_write(ump, jseg, data);
3497 off += JREC_SIZE;
3498 data = bp->b_data + off;
3499 }
3500 if (wk == needwk)
3501 needwk = NULL;
3502 remove_from_journal(wk);
3503 wk->wk_state |= INPROGRESS;
3504 WORKLIST_INSERT(&jseg->js_entries, wk);
3505 switch (wk->wk_type) {
3506 case D_JADDREF:
3507 jaddref_write(WK_JADDREF(wk), jseg, data);
3508 break;
3509 case D_JREMREF:
3510 jremref_write(WK_JREMREF(wk), jseg, data);
3511 break;
3512 case D_JMVREF:
3513 jmvref_write(WK_JMVREF(wk), jseg, data);
3514 break;
3515 case D_JNEWBLK:
3516 jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3517 break;
3518 case D_JFREEBLK:
3519 jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3520 break;
3521 case D_JFREEFRAG:
3522 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3523 break;
3524 case D_JTRUNC:
3525 jtrunc_write(WK_JTRUNC(wk), jseg, data);
3526 break;
3527 case D_JFSYNC:
3528 jfsync_write(WK_JFSYNC(wk), jseg, data);
3529 break;
3530 default:
3531 panic("process_journal: Unknown type %s",
3532 TYPENAME(wk->wk_type));
3533 /* NOTREACHED */
3534 }
3535 off += JREC_SIZE;
3536 data = bp->b_data + off;
3537 cnt--;
3538 }
3539
3540 /* Clear any remaining space so we don't leak kernel data */
3541 if (size > off)
3542 bzero(data, size - off);
3543
3544 /*
3545 * Write this one buffer and continue.
3546 */
3547 segwritten = 1;
3548 jblocks->jb_needseg = 0;
3549 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3550 FREE_LOCK(ump);
3551 pbgetvp(ump->um_devvp, bp);
3552 /*
3553 * We only do the blocking wait once we find the journal
3554 * entry we're looking for.
3555 */
3556 if (needwk == NULL && flags == MNT_WAIT)
3557 bwrite(bp);
3558 else
3559 bawrite(bp);
3560 ACQUIRE_LOCK(ump);
3561 }
3562 /*
3563 * If we wrote a segment issue a synchronize cache so the journal
3564 * is reflected on disk before the data is written. Since reclaiming
3565 * journal space also requires writing a journal record this
3566 * process also enforces a barrier before reclamation.
3567 */
3568 if (segwritten && shouldflush) {
3569 softdep_synchronize(bio, ump,
3570 TAILQ_LAST(&jblocks->jb_segs, jseglst));
3571 } else if (bio)
3572 g_destroy_bio(bio);
3573 /*
3574 * If we've suspended the filesystem because we ran out of journal
3575 * space either try to sync it here to make some progress or
3576 * unsuspend it if we already have.
3577 */
3578 if (flags == 0 && jblocks->jb_suspended) {
3579 if (journal_unsuspend(ump))
3580 return;
3581 FREE_LOCK(ump);
3582 VFS_SYNC(mp, MNT_NOWAIT);
3583 ffs_sbupdate(ump, MNT_WAIT, 0);
3584 ACQUIRE_LOCK(ump);
3585 }
3586 }
3587
3588 /*
3589 * Complete a jseg, allowing all dependencies awaiting journal writes
3590 * to proceed. Each journal dependency also attaches a jsegdep to dependent
3591 * structures so that the journal segment can be freed to reclaim space.
3592 */
3593 static void
3594 complete_jseg(jseg)
3595 struct jseg *jseg;
3596 {
3597 struct worklist *wk;
3598 struct jmvref *jmvref;
3599 #ifdef INVARIANTS
3600 int i = 0;
3601 #endif
3602
3603 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3604 WORKLIST_REMOVE(wk);
3605 wk->wk_state &= ~INPROGRESS;
3606 wk->wk_state |= COMPLETE;
3607 KASSERT(i++ < jseg->js_cnt,
3608 ("handle_written_jseg: overflow %d >= %d",
3609 i - 1, jseg->js_cnt));
3610 switch (wk->wk_type) {
3611 case D_JADDREF:
3612 handle_written_jaddref(WK_JADDREF(wk));
3613 break;
3614 case D_JREMREF:
3615 handle_written_jremref(WK_JREMREF(wk));
3616 break;
3617 case D_JMVREF:
3618 rele_jseg(jseg); /* No jsegdep. */
3619 jmvref = WK_JMVREF(wk);
3620 LIST_REMOVE(jmvref, jm_deps);
3621 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3622 free_pagedep(jmvref->jm_pagedep);
3623 WORKITEM_FREE(jmvref, D_JMVREF);
3624 break;
3625 case D_JNEWBLK:
3626 handle_written_jnewblk(WK_JNEWBLK(wk));
3627 break;
3628 case D_JFREEBLK:
3629 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3630 break;
3631 case D_JTRUNC:
3632 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3633 break;
3634 case D_JFSYNC:
3635 rele_jseg(jseg); /* No jsegdep. */
3636 WORKITEM_FREE(wk, D_JFSYNC);
3637 break;
3638 case D_JFREEFRAG:
3639 handle_written_jfreefrag(WK_JFREEFRAG(wk));
3640 break;
3641 default:
3642 panic("handle_written_jseg: Unknown type %s",
3643 TYPENAME(wk->wk_type));
3644 /* NOTREACHED */
3645 }
3646 }
3647 /* Release the self reference so the structure may be freed. */
3648 rele_jseg(jseg);
3649 }
3650
3651 /*
3652 * Determine which jsegs are ready for completion processing. Waits for
3653 * synchronize cache to complete as well as forcing in-order completion
3654 * of journal entries.
3655 */
3656 static void
3657 complete_jsegs(jseg)
3658 struct jseg *jseg;
3659 {
3660 struct jblocks *jblocks;
3661 struct jseg *jsegn;
3662
3663 jblocks = jseg->js_jblocks;
3664 /*
3665 * Don't allow out of order completions. If this isn't the first
3666 * block wait for it to write before we're done.
3667 */
3668 if (jseg != jblocks->jb_writeseg)
3669 return;
3670 /* Iterate through available jsegs processing their entries. */
3671 while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3672 jblocks->jb_oldestwrseq = jseg->js_oldseq;
3673 jsegn = TAILQ_NEXT(jseg, js_next);
3674 complete_jseg(jseg);
3675 jseg = jsegn;
3676 }
3677 jblocks->jb_writeseg = jseg;
3678 /*
3679 * Attempt to free jsegs now that oldestwrseq may have advanced.
3680 */
3681 free_jsegs(jblocks);
3682 }
3683
3684 /*
3685 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle
3686 * the final completions.
3687 */
3688 static void
3689 handle_written_jseg(jseg, bp)
3690 struct jseg *jseg;
3691 struct buf *bp;
3692 {
3693
3694 if (jseg->js_refs == 0)
3695 panic("handle_written_jseg: No self-reference on %p", jseg);
3696 jseg->js_state |= DEPCOMPLETE;
3697 /*
3698 * We'll never need this buffer again, set flags so it will be
3699 * discarded.
3700 */
3701 bp->b_flags |= B_INVAL | B_NOCACHE;
3702 pbrelvp(bp);
3703 complete_jsegs(jseg);
3704 }
3705
3706 static inline struct jsegdep *
3707 inoref_jseg(inoref)
3708 struct inoref *inoref;
3709 {
3710 struct jsegdep *jsegdep;
3711
3712 jsegdep = inoref->if_jsegdep;
3713 inoref->if_jsegdep = NULL;
3714
3715 return (jsegdep);
3716 }
3717
3718 /*
3719 * Called once a jremref has made it to stable store. The jremref is marked
3720 * complete and we attempt to free it. Any pagedep writes sleeping while
3721 * waiting for the jremref to complete will be awoken by free_jremref.
3722 */
3723 static void
3724 handle_written_jremref(jremref)
3725 struct jremref *jremref;
3726 {
3727 struct inodedep *inodedep;
3728 struct jsegdep *jsegdep;
3729 struct dirrem *dirrem;
3730
3731 /* Grab the jsegdep. */
3732 jsegdep = inoref_jseg(&jremref->jr_ref);
3733 /*
3734 * Remove us from the inoref list.
3735 */
3736 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3737 0, &inodedep) == 0)
3738 panic("handle_written_jremref: Lost inodedep");
3739 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3740 /*
3741 * Complete the dirrem.
3742 */
3743 dirrem = jremref->jr_dirrem;
3744 jremref->jr_dirrem = NULL;
3745 LIST_REMOVE(jremref, jr_deps);
3746 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3747 jwork_insert(&dirrem->dm_jwork, jsegdep);
3748 if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3749 (dirrem->dm_state & COMPLETE) != 0)
3750 add_to_worklist(&dirrem->dm_list, 0);
3751 free_jremref(jremref);
3752 }
3753
3754 /*
3755 * Called once a jaddref has made it to stable store. The dependency is
3756 * marked complete and any dependent structures are added to the inode
3757 * bufwait list to be completed as soon as it is written. If a bitmap write
3758 * depends on this entry we move the inode into the inodedephd of the
3759 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3760 */
3761 static void
3762 handle_written_jaddref(jaddref)
3763 struct jaddref *jaddref;
3764 {
3765 struct jsegdep *jsegdep;
3766 struct inodedep *inodedep;
3767 struct diradd *diradd;
3768 struct mkdir *mkdir;
3769
3770 /* Grab the jsegdep. */
3771 jsegdep = inoref_jseg(&jaddref->ja_ref);
3772 mkdir = NULL;
3773 diradd = NULL;
3774 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3775 0, &inodedep) == 0)
3776 panic("handle_written_jaddref: Lost inodedep.");
3777 if (jaddref->ja_diradd == NULL)
3778 panic("handle_written_jaddref: No dependency");
3779 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3780 diradd = jaddref->ja_diradd;
3781 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3782 } else if (jaddref->ja_state & MKDIR_PARENT) {
3783 mkdir = jaddref->ja_mkdir;
3784 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3785 } else if (jaddref->ja_state & MKDIR_BODY)
3786 mkdir = jaddref->ja_mkdir;
3787 else
3788 panic("handle_written_jaddref: Unknown dependency %p",
3789 jaddref->ja_diradd);
3790 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */
3791 /*
3792 * Remove us from the inode list.
3793 */
3794 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3795 /*
3796 * The mkdir may be waiting on the jaddref to clear before freeing.
3797 */
3798 if (mkdir) {
3799 KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3800 ("handle_written_jaddref: Incorrect type for mkdir %s",
3801 TYPENAME(mkdir->md_list.wk_type)));
3802 mkdir->md_jaddref = NULL;
3803 diradd = mkdir->md_diradd;
3804 mkdir->md_state |= DEPCOMPLETE;
3805 complete_mkdir(mkdir);
3806 }
3807 jwork_insert(&diradd->da_jwork, jsegdep);
3808 if (jaddref->ja_state & NEWBLOCK) {
3809 inodedep->id_state |= ONDEPLIST;
3810 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3811 inodedep, id_deps);
3812 }
3813 free_jaddref(jaddref);
3814 }
3815
3816 /*
3817 * Called once a jnewblk journal is written. The allocdirect or allocindir
3818 * is placed in the bmsafemap to await notification of a written bitmap. If
3819 * the operation was canceled we add the segdep to the appropriate
3820 * dependency to free the journal space once the canceling operation
3821 * completes.
3822 */
3823 static void
3824 handle_written_jnewblk(jnewblk)
3825 struct jnewblk *jnewblk;
3826 {
3827 struct bmsafemap *bmsafemap;
3828 struct freefrag *freefrag;
3829 struct freework *freework;
3830 struct jsegdep *jsegdep;
3831 struct newblk *newblk;
3832
3833 /* Grab the jsegdep. */
3834 jsegdep = jnewblk->jn_jsegdep;
3835 jnewblk->jn_jsegdep = NULL;
3836 if (jnewblk->jn_dep == NULL)
3837 panic("handle_written_jnewblk: No dependency for the segdep.");
3838 switch (jnewblk->jn_dep->wk_type) {
3839 case D_NEWBLK:
3840 case D_ALLOCDIRECT:
3841 case D_ALLOCINDIR:
3842 /*
3843 * Add the written block to the bmsafemap so it can
3844 * be notified when the bitmap is on disk.
3845 */
3846 newblk = WK_NEWBLK(jnewblk->jn_dep);
3847 newblk->nb_jnewblk = NULL;
3848 if ((newblk->nb_state & GOINGAWAY) == 0) {
3849 bmsafemap = newblk->nb_bmsafemap;
3850 newblk->nb_state |= ONDEPLIST;
3851 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3852 nb_deps);
3853 }
3854 jwork_insert(&newblk->nb_jwork, jsegdep);
3855 break;
3856 case D_FREEFRAG:
3857 /*
3858 * A newblock is being removed by a freefrag because it was
3859 * replaced by a fragment extension.
3860 */
3861 freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3862 freefrag->ff_jdep = NULL;
3863 jwork_insert(&freefrag->ff_jwork, jsegdep);
3864 break;
3865 case D_FREEWORK:
3866 /*
3867 * A direct block was removed by truncate.
3868 */
3869 freework = WK_FREEWORK(jnewblk->jn_dep);
3870 freework->fw_jnewblk = NULL;
3871 jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3872 break;
3873 default:
3874 panic("handle_written_jnewblk: Unknown type %d.",
3875 jnewblk->jn_dep->wk_type);
3876 }
3877 jnewblk->jn_dep = NULL;
3878 free_jnewblk(jnewblk);
3879 }
3880
3881 /*
3882 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3883 * an in-flight allocation that has not yet been committed. Divorce us
3884 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3885 * to the worklist.
3886 */
3887 static void
3888 cancel_jfreefrag(jfreefrag)
3889 struct jfreefrag *jfreefrag;
3890 {
3891 struct freefrag *freefrag;
3892
3893 if (jfreefrag->fr_jsegdep) {
3894 free_jsegdep(jfreefrag->fr_jsegdep);
3895 jfreefrag->fr_jsegdep = NULL;
3896 }
3897 freefrag = jfreefrag->fr_freefrag;
3898 jfreefrag->fr_freefrag = NULL;
3899 free_jfreefrag(jfreefrag);
3900 freefrag->ff_state |= DEPCOMPLETE;
3901 CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3902 }
3903
3904 /*
3905 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3906 */
3907 static void
3908 free_jfreefrag(jfreefrag)
3909 struct jfreefrag *jfreefrag;
3910 {
3911
3912 if (jfreefrag->fr_state & INPROGRESS)
3913 WORKLIST_REMOVE(&jfreefrag->fr_list);
3914 else if (jfreefrag->fr_state & ONWORKLIST)
3915 remove_from_journal(&jfreefrag->fr_list);
3916 if (jfreefrag->fr_freefrag != NULL)
3917 panic("free_jfreefrag: Still attached to a freefrag.");
3918 WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3919 }
3920
3921 /*
3922 * Called when the journal write for a jfreefrag completes. The parent
3923 * freefrag is added to the worklist if this completes its dependencies.
3924 */
3925 static void
3926 handle_written_jfreefrag(jfreefrag)
3927 struct jfreefrag *jfreefrag;
3928 {
3929 struct jsegdep *jsegdep;
3930 struct freefrag *freefrag;
3931
3932 /* Grab the jsegdep. */
3933 jsegdep = jfreefrag->fr_jsegdep;
3934 jfreefrag->fr_jsegdep = NULL;
3935 freefrag = jfreefrag->fr_freefrag;
3936 if (freefrag == NULL)
3937 panic("handle_written_jfreefrag: No freefrag.");
3938 freefrag->ff_state |= DEPCOMPLETE;
3939 freefrag->ff_jdep = NULL;
3940 jwork_insert(&freefrag->ff_jwork, jsegdep);
3941 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3942 add_to_worklist(&freefrag->ff_list, 0);
3943 jfreefrag->fr_freefrag = NULL;
3944 free_jfreefrag(jfreefrag);
3945 }
3946
3947 /*
3948 * Called when the journal write for a jfreeblk completes. The jfreeblk
3949 * is removed from the freeblks list of pending journal writes and the
3950 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3951 * have been reclaimed.
3952 */
3953 static void
3954 handle_written_jblkdep(jblkdep)
3955 struct jblkdep *jblkdep;
3956 {
3957 struct freeblks *freeblks;
3958 struct jsegdep *jsegdep;
3959
3960 /* Grab the jsegdep. */
3961 jsegdep = jblkdep->jb_jsegdep;
3962 jblkdep->jb_jsegdep = NULL;
3963 freeblks = jblkdep->jb_freeblks;
3964 LIST_REMOVE(jblkdep, jb_deps);
3965 jwork_insert(&freeblks->fb_jwork, jsegdep);
3966 /*
3967 * If the freeblks is all journaled, we can add it to the worklist.
3968 */
3969 if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3970 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3971 add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3972
3973 free_jblkdep(jblkdep);
3974 }
3975
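/*
 * Allocate a jsegdep to track the journal segment that will eventually
 * hold the record for the given work item.
 */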
3976 static struct jsegdep *
3977 newjsegdep(struct worklist *wk)
3978 {
3979 struct jsegdep *jsegdep;
3980
3981 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3982 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3983 jsegdep->jd_seg = NULL;
3984
3985 return (jsegdep);
3986 }
3987
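/*
 * Allocate a jmvref to journal the movement of the directory entry for
 * ino within directory dp from offset oldoff to offset newoff.
 */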
3988 static struct jmvref *
3989 newjmvref(dp, ino, oldoff, newoff)
3990 struct inode *dp;
3991 ino_t ino;
3992 off_t oldoff;
3993 off_t newoff;
3994 {
3995 struct jmvref *jmvref;
3996
3997 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3998 workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
3999 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4000 jmvref->jm_parent = dp->i_number;
4001 jmvref->jm_ino = ino;
4002 jmvref->jm_oldoff = oldoff;
4003 jmvref->jm_newoff = newoff;
4004
4005 return (jmvref);
4006 }
4007
4008 /*
4009 * Allocate a new jremref that tracks the removal of ip from dp with the
4010 * directory entry offset of diroff. Mark the entry as ATTACHED and
4011 * DEPCOMPLETE as we have all the information required for the journal write
4012 * and the directory has already been removed from the buffer. The caller
4013 * is responsible for linking the jremref into the pagedep and adding it
4014 * to the journal to write. The MKDIR_PARENT flag is set if we're doing
4015 * a DOTDOT addition so handle_workitem_remove() can properly assign
4016 * the jsegdep when we're done.
4017 */
4018 static struct jremref *
4019 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4020 off_t diroff, nlink_t nlink)
4021 {
4022 struct jremref *jremref;
4023
4024 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4025 workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4026 jremref->jr_state = ATTACHED;
4027 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4028 nlink, ip->i_mode);
4029 jremref->jr_dirrem = dirrem;
4030
4031 return (jremref);
4032 }
4033
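/*
 * Initialize the inoref fields shared by jaddref and jremref and allocate
 * the jsegdep used to track the journal segment for the record.
 */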
4034 static inline void
4035 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4036 nlink_t nlink, uint16_t mode)
4037 {
4038
4039 inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4040 inoref->if_diroff = diroff;
4041 inoref->if_ino = ino;
4042 inoref->if_parent = parent;
4043 inoref->if_nlink = nlink;
4044 inoref->if_mode = mode;
4045 }
4046
4047 /*
4048 * Allocate a new jaddref to track the addition of ino to dp at diroff. The
4049 * directory offset may not be known until later. The caller is responsible
4050 * for adding the entry to the journal when this information is available.
4051 * nlink should be the link count prior to the addition and mode is only
4052 * required to have the correct FMT.
4053 */
4054 static struct jaddref *
4055 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4056 uint16_t mode)
4057 {
4058 struct jaddref *jaddref;
4059
4060 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4061 workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4062 jaddref->ja_state = ATTACHED;
4063 jaddref->ja_mkdir = NULL;
4064 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4065
4066 return (jaddref);
4067 }
4068
4069 /*
4070 * Create a new free dependency for a freework. The caller is responsible
4071 * for adjusting the reference count when it has the lock held. The freedep
4072 * will track an outstanding bitmap write that will ultimately clear the
4073 * freework to continue.
4074 */
4075 static struct freedep *
4076 newfreedep(struct freework *freework)
4077 {
4078 struct freedep *freedep;
4079
4080 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4081 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4082 freedep->fd_freework = freework;
4083
4084 return (freedep);
4085 }
4086
4087 /*
4088 * Free a freedep structure once the buffer it is linked to is written. If
4089 * this is the last reference to the freework schedule it for completion.
4090 */
4091 static void
4092 free_freedep(freedep)
4093 struct freedep *freedep;
4094 {
4095 struct freework *freework;
4096
4097 freework = freedep->fd_freework;
4098 freework->fw_freeblks->fb_cgwait--;
4099 if (--freework->fw_ref == 0)
4100 freework_enqueue(freework);
4101 WORKITEM_FREE(freedep, D_FREEDEP);
4102 }
4103
4104 /*
4105 * Allocate a new freework structure that may be a level in an indirect chain
4106 * when parent is not NULL, or a top-level block when it is NULL. The top-level
4107 * freework structures are allocated without the per-filesystem lock held
4108 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4109 */
4110 static struct freework *
4111 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4112 struct ufsmount *ump;
4113 struct freeblks *freeblks;
4114 struct freework *parent;
4115 ufs_lbn_t lbn;
4116 ufs2_daddr_t nb;
4117 int frags;
4118 int off;
4119 int journal;
4120 {
4121 struct freework *freework;
4122
4123 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4124 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4125 freework->fw_state = ATTACHED;
4126 freework->fw_jnewblk = NULL;
4127 freework->fw_freeblks = freeblks;
4128 freework->fw_parent = parent;
4129 freework->fw_lbn = lbn;
4130 freework->fw_blkno = nb;
4131 freework->fw_frags = frags;
4132 freework->fw_indir = NULL;
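/*
 * With SUJ disabled, or for data and extattr blocks (lbn >= -NXADDR),
 * no references are taken; an indirect block takes one reference per
 * child pointer plus one for itself.
 */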
4133 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4134 ? 0 : NINDIR(ump->um_fs) + 1;
4135 freework->fw_start = freework->fw_off = off;
4136 if (journal)
4137 newjfreeblk(freeblks, lbn, nb, frags);
4138 if (parent == NULL) {
4139 ACQUIRE_LOCK(ump);
4140 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4141 freeblks->fb_ref++;
4142 FREE_LOCK(ump);
4143 }
4144
4145 return (freework);
4146 }
4147
4148 /*
4149 * Eliminate a jfreeblk for a block that does not need journaling.
4150 */
4151 static void
4152 cancel_jfreeblk(freeblks, blkno)
4153 struct freeblks *freeblks;
4154 ufs2_daddr_t blkno;
4155 {
4156 struct jfreeblk *jfreeblk;
4157 struct jblkdep *jblkdep;
4158
4159 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4160 if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4161 continue;
4162 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4163 if (jfreeblk->jf_blkno == blkno)
4164 break;
4165 }
4166 if (jblkdep == NULL)
4167 return;
4168 CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4169 free_jsegdep(jblkdep->jb_jsegdep);
4170 LIST_REMOVE(jblkdep, jb_deps);
4171 WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4172 }
4173
4174 /*
4175 * Allocate a new jfreeblk to journal top level block pointer when truncating
4176 * a file. The caller must add this to the worklist when the per-filesystem
4177 * lock is held.
4178 */
4179 static struct jfreeblk *
4180 newjfreeblk(freeblks, lbn, blkno, frags)
4181 struct freeblks *freeblks;
4182 ufs_lbn_t lbn;
4183 ufs2_daddr_t blkno;
4184 int frags;
4185 {
4186 struct jfreeblk *jfreeblk;
4187
4188 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4189 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4190 freeblks->fb_list.wk_mp);
4191 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4192 jfreeblk->jf_dep.jb_freeblks = freeblks;
4193 jfreeblk->jf_ino = freeblks->fb_inum;
4194 jfreeblk->jf_lbn = lbn;
4195 jfreeblk->jf_blkno = blkno;
4196 jfreeblk->jf_frags = frags;
4197 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4198
4199 return (jfreeblk);
4200 }
4201
4202 /*
4203 * The journal is only prepared to handle full-size block numbers, so we
4204 * have to adjust the record to reflect the change to a full-size block.
4205 * For example, suppose we have a block made up of fragments 8-15 and
4206 * want to free its last two fragments. We are given a request that says:
4207 * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4208 * where frags are the number of fragments to free and oldfrags are the
4209 * number of fragments to keep. To block align it, we have to change it to
4210 * have a valid full-size blkno, so it becomes:
4211 * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4212 */
4213 static void
4214 adjust_newfreework(freeblks, frag_offset)
4215 struct freeblks *freeblks;
4216 int frag_offset;
4217 {
4218 struct jfreeblk *jfreeblk;
4219
4220 KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4221 LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4222 ("adjust_newfreework: Missing freeblks dependency"));
4223
4224 jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4225 jfreeblk->jf_blkno -= frag_offset;
4226 jfreeblk->jf_frags += frag_offset;
4227 }
4228
4229 /*
4230 * Allocate a new jtrunc to track a partial truncation.
4231 */
4232 static struct jtrunc *
4233 newjtrunc(freeblks, size, extsize)
4234 struct freeblks *freeblks;
4235 off_t size;
4236 int extsize;
4237 {
4238 struct jtrunc *jtrunc;
4239
4240 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4241 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4242 freeblks->fb_list.wk_mp);
4243 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4244 jtrunc->jt_dep.jb_freeblks = freeblks;
4245 jtrunc->jt_ino = freeblks->fb_inum;
4246 jtrunc->jt_size = size;
4247 jtrunc->jt_extsize = extsize;
4248 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4249
4250 return (jtrunc);
4251 }
4252
4253 /*
4254 * If we're canceling a jaddref that holds a new-block bitmap dependency,
4255 * we have to search for another ref to move into the bmsafemap dep. This
4256 * might be better expressed with another structure.
4257 */
4258 static void
4259 move_newblock_dep(jaddref, inodedep)
4260 struct jaddref *jaddref;
4261 struct inodedep *inodedep;
4262 {
4263 struct inoref *inoref;
4264 struct jaddref *jaddrefn;
4265
4266 jaddrefn = NULL;
4267 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4268 inoref = TAILQ_NEXT(inoref, if_deps)) {
4269 if ((jaddref->ja_state & NEWBLOCK) &&
4270 inoref->if_list.wk_type == D_JADDREF) {
4271 jaddrefn = (struct jaddref *)inoref;
4272 break;
4273 }
4274 }
4275 if (jaddrefn == NULL)
4276 return;
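/*
 * Transfer the bitmap dependency state to the replacement jaddref and
 * take the canceled jaddref's place on the bmsafemap jaddref list.
 */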
4277 jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4278 jaddrefn->ja_state |= jaddref->ja_state &
4279 (ATTACHED | UNDONE | NEWBLOCK);
4280 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4281 jaddref->ja_state |= ATTACHED;
4282 LIST_REMOVE(jaddref, ja_bmdeps);
4283 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4284 ja_bmdeps);
4285 }
4286
4287 /*
4288 * Cancel a jaddref either before it has been written or while it is being
4289 * written. This happens when a link is removed before the add reaches
4290 * the disk. The jaddref dependency is kept linked into the bmsafemap
4291 * and inode to prevent the link count or bitmap from reaching the disk
4292 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4293 * required.
4294 *
4295 * Returns 1 if the canceled addref requires journaling of the remove and
4296 * 0 otherwise.
4297 */
4298 static int
4299 cancel_jaddref(jaddref, inodedep, wkhd)
4300 struct jaddref *jaddref;
4301 struct inodedep *inodedep;
4302 struct workhead *wkhd;
4303 {
4304 struct inoref *inoref;
4305 struct jsegdep *jsegdep;
4306 int needsj;
4307
4308 KASSERT((jaddref->ja_state & COMPLETE) == 0,
4309 ("cancel_jaddref: Canceling complete jaddref"));
4310 if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4311 needsj = 1;
4312 else
4313 needsj = 0;
4314 if (inodedep == NULL)
4315 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4316 0, &inodedep) == 0)
4317 panic("cancel_jaddref: Lost inodedep");
4318 /*
4319 * We must adjust the nlink of any reference operation that follows
4320 * us so that it is consistent with the in-memory reference. This
4321 * ensures that inode nlink rollbacks always have the correct link.
4322 */
4323 if (needsj == 0) {
4324 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4325 inoref = TAILQ_NEXT(inoref, if_deps)) {
4326 if (inoref->if_state & GOINGAWAY)
4327 break;
4328 inoref->if_nlink--;
4329 }
4330 }
4331 jsegdep = inoref_jseg(&jaddref->ja_ref);
4332 if (jaddref->ja_state & NEWBLOCK)
4333 move_newblock_dep(jaddref, inodedep);
4334 wake_worklist(&jaddref->ja_list);
4335 jaddref->ja_mkdir = NULL;
4336 if (jaddref->ja_state & INPROGRESS) {
4337 jaddref->ja_state &= ~INPROGRESS;
4338 WORKLIST_REMOVE(&jaddref->ja_list);
4339 jwork_insert(wkhd, jsegdep);
4340 } else {
4341 free_jsegdep(jsegdep);
4342 if (jaddref->ja_state & DEPCOMPLETE)
4343 remove_from_journal(&jaddref->ja_list);
4344 }
4345 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4346 /*
4347 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4348 * can arrange for them to be freed with the bitmap. Otherwise we
4349 * no longer need this addref attached to the inoreflst and it
4350 * will incorrectly adjust nlink if we leave it.
4351 */
4352 if ((jaddref->ja_state & NEWBLOCK) == 0) {
4353 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4354 if_deps);
4355 jaddref->ja_state |= COMPLETE;
4356 free_jaddref(jaddref);
4357 return (needsj);
4358 }
4359 /*
4360 * Leave the head of the list for jsegdeps for fast merging.
4361 */
4362 if (LIST_FIRST(wkhd) != NULL) {
4363 jaddref->ja_state |= ONWORKLIST;
4364 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4365 } else
4366 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4367
4368 return (needsj);
4369 }
4370
4371 /*
4372 * Attempt to free a jaddref structure when some work completes. This
4373 * should only succeed once the entry is written and all dependencies have
4374 * been notified.
4375 */
4376 static void
4377 free_jaddref(jaddref)
4378 struct jaddref *jaddref;
4379 {
4380
4381 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4382 return;
4383 if (jaddref->ja_ref.if_jsegdep)
4384 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4385 jaddref, jaddref->ja_state);
4386 if (jaddref->ja_state & NEWBLOCK)
4387 LIST_REMOVE(jaddref, ja_bmdeps);
4388 if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4389 panic("free_jaddref: Bad state %p(0x%X)",
4390 jaddref, jaddref->ja_state);
4391 if (jaddref->ja_mkdir != NULL)
4392 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4393 WORKITEM_FREE(jaddref, D_JADDREF);
4394 }
4395
4396 /*
4397 * Free a jremref structure once it has been written or discarded.
4398 */
4399 static void
4400 free_jremref(jremref)
4401 struct jremref *jremref;
4402 {
4403
4404 if (jremref->jr_ref.if_jsegdep)
4405 free_jsegdep(jremref->jr_ref.if_jsegdep);
4406 if (jremref->jr_state & INPROGRESS)
4407 panic("free_jremref: IO still pending");
4408 WORKITEM_FREE(jremref, D_JREMREF);
4409 }
4410
4411 /*
4412 * Free a jnewblk structure.
4413 */
4414 static void
4415 free_jnewblk(jnewblk)
4416 struct jnewblk *jnewblk;
4417 {
4418
4419 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4420 return;
4421 LIST_REMOVE(jnewblk, jn_deps);
4422 if (jnewblk->jn_dep != NULL)
4423 panic("free_jnewblk: Dependency still attached.");
4424 WORKITEM_FREE(jnewblk, D_JNEWBLK);
4425 }
4426
4427 /*
4428 * Cancel a jnewblk which has been made redundant by frag extension.
4429 */
4430 static void
4431 cancel_jnewblk(jnewblk, wkhd)
4432 struct jnewblk *jnewblk;
4433 struct workhead *wkhd;
4434 {
4435 struct jsegdep *jsegdep;
4436
4437 CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4438 jsegdep = jnewblk->jn_jsegdep;
4439 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4440 panic("cancel_jnewblk: Invalid state");
4441 jnewblk->jn_jsegdep = NULL;
4442 jnewblk->jn_dep = NULL;
4443 jnewblk->jn_state |= GOINGAWAY;
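/*
 * If the journal write is still in progress move the jsegdep to the
 * caller's work list; otherwise the record may be removed from the
 * journal and the jsegdep freed immediately.
 */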
4444 if (jnewblk->jn_state & INPROGRESS) {
4445 jnewblk->jn_state &= ~INPROGRESS;
4446 WORKLIST_REMOVE(&jnewblk->jn_list);
4447 jwork_insert(wkhd, jsegdep);
4448 } else {
4449 free_jsegdep(jsegdep);
4450 remove_from_journal(&jnewblk->jn_list);
4451 }
4452 wake_worklist(&jnewblk->jn_list);
4453 WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4454 }
4455
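/*
 * Free a jblkdep according to the structure it is embedded in, either a
 * jfreeblk or a jtrunc.
 */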
4456 static void
4457 free_jblkdep(jblkdep)
4458 struct jblkdep *jblkdep;
4459 {
4460
4461 if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4462 WORKITEM_FREE(jblkdep, D_JFREEBLK);
4463 else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4464 WORKITEM_FREE(jblkdep, D_JTRUNC);
4465 else
4466 panic("free_jblkdep: Unexpected type %s",
4467 TYPENAME(jblkdep->jb_list.wk_type));
4468 }
4469
4470 /*
4471 * Free a single jseg once it is no longer referenced in memory or on
4472 * disk. Reclaim journal blocks and dependencies waiting for the segment
4473 * to disappear.
4474 */
4475 static void
4476 free_jseg(jseg, jblocks)
4477 struct jseg *jseg;
4478 struct jblocks *jblocks;
4479 {
4480 struct freework *freework;
4481
4482 /*
4483 * Free freework structures that were lingering to indicate freed
4484 * indirect blocks that forced journal write ordering on reallocate.
4485 */
4486 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4487 indirblk_remove(freework);
4488 if (jblocks->jb_oldestseg == jseg)
4489 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4490 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4491 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4492 KASSERT(LIST_EMPTY(&jseg->js_entries),
4493 ("free_jseg: Freed jseg has valid entries."));
4494 WORKITEM_FREE(jseg, D_JSEG);
4495 }
4496
4497 /*
4498 * Free all jsegs that meet the criteria for being reclaimed and update
4499 * oldestseg.
4500 */
4501 static void
4502 free_jsegs(jblocks)
4503 struct jblocks *jblocks;
4504 {
4505 struct jseg *jseg;
4506
4507 /*
4508 * Free only those jsegs which have none allocated before them to
4509 * preserve the journal space ordering.
4510 */
4511 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4512 /*
4513 * Only reclaim space when nothing depends on this journal
4514 * set and another set has written that it is no longer
4515 * valid.
4516 */
4517 if (jseg->js_refs != 0) {
4518 jblocks->jb_oldestseg = jseg;
4519 return;
4520 }
4521 if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4522 break;
4523 if (jseg->js_seq > jblocks->jb_oldestwrseq)
4524 break;
4525 /*
4526 * We can free jsegs that didn't write entries when
4527 * oldestwrseq == js_seq.
4528 */
4529 if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4530 jseg->js_cnt != 0)
4531 break;
4532 free_jseg(jseg, jblocks);
4533 }
4534 /*
4535 * If we exited the loop above we still must discover the
4536 * oldest valid segment.
4537 */
4538 if (jseg)
4539 for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4540 jseg = TAILQ_NEXT(jseg, js_next))
4541 if (jseg->js_refs != 0)
4542 break;
4543 jblocks->jb_oldestseg = jseg;
4544 /*
4545 * The journal has no valid records but some jsegs may still be
4546 * waiting on oldestwrseq to advance. We force a small record
4547 * out to permit these lingering records to be reclaimed.
4548 */
4549 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4550 jblocks->jb_needseg = 1;
4551 }
4552
4553 /*
4554 * Release one reference to a jseg and free it if the count reaches 0. This
4555 * should eventually reclaim journal space as well.
4556 */
4557 static void
4558 rele_jseg(jseg)
4559 struct jseg *jseg;
4560 {
4561
4562 KASSERT(jseg->js_refs > 0,
4563 ("rele_jseg: Invalid refcnt %d", jseg->js_refs));
4564 if (--jseg->js_refs != 0)
4565 return;
4566 free_jsegs(jseg->js_jblocks);
4567 }
4568
4569 /*
4570 * Release a jsegdep and decrement the jseg count.
4571 */
4572 static void
4573 free_jsegdep(jsegdep)
4574 struct jsegdep *jsegdep;
4575 {
4576
4577 if (jsegdep->jd_seg)
4578 rele_jseg(jsegdep->jd_seg);
4579 WORKITEM_FREE(jsegdep, D_JSEGDEP);
4580 }
4581
4582 /*
4583 * Wait for a journal item to make it to disk. Initiate journal processing
4584 * if required.
4585 */
4586 static int
4587 jwait(wk, waitfor)
4588 struct worklist *wk;
4589 int waitfor;
4590 {
4591
4592 LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4593 /*
4594 * Blocking journal waits cause slow synchronous behavior. Record
4595 * stats on the frequency of these blocking operations.
4596 */
4597 if (waitfor == MNT_WAIT) {
4598 stat_journal_wait++;
4599 switch (wk->wk_type) {
4600 case D_JREMREF:
4601 case D_JMVREF:
4602 stat_jwait_filepage++;
4603 break;
4604 case D_JTRUNC:
4605 case D_JFREEBLK:
4606 stat_jwait_freeblks++;
4607 break;
4608 case D_JNEWBLK:
4609 stat_jwait_newblk++;
4610 break;
4611 case D_JADDREF:
4612 stat_jwait_inode++;
4613 break;
4614 default:
4615 break;
4616 }
4617 }
4618 /*
4619 * If IO has not started we process the journal. We can't mark the
4620 * worklist item as IOWAITING because we drop the lock while
4621 * processing the journal and the worklist entry may be freed after
4622 * this point. The caller may call back in and re-issue the request.
4623 */
4624 if ((wk->wk_state & INPROGRESS) == 0) {
4625 softdep_process_journal(wk->wk_mp, wk, waitfor);
4626 if (waitfor != MNT_WAIT)
4627 return (EBUSY);
4628 return (0);
4629 }
4630 if (waitfor != MNT_WAIT)
4631 return (EBUSY);
4632 wait_worklist(wk, "jwait");
4633 return (0);
4634 }
4635
4636 /*
4637 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4638 * appropriate. This is a convenience function to reduce duplicate code
4639 * for the setup and revert functions below.
4640 */
4641 static struct inodedep *
4642 inodedep_lookup_ip(ip)
4643 struct inode *ip;
4644 {
4645 struct inodedep *inodedep;
4646
4647 KASSERT(ip->i_nlink >= ip->i_effnlink,
4648 ("inodedep_lookup_ip: bad delta"));
4649 (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
4650 &inodedep);
4651 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4652 KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4653
4654 return (inodedep);
4655 }
4656
4657 /*
4658 * Called prior to creating a new inode and linking it to a directory. The
4659 * jaddref structure must already be allocated by softdep_setup_inomapdep
4660 * and it is discovered here so we can initialize the mode and update
4661 * nlinkdelta.
4662 */
4663 void
4664 softdep_setup_create(dp, ip)
4665 struct inode *dp;
4666 struct inode *ip;
4667 {
4668 struct inodedep *inodedep;
4669 struct jaddref *jaddref;
4670 struct vnode *dvp;
4671
4672 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4673 ("softdep_setup_create called on non-softdep filesystem"));
4674 KASSERT(ip->i_nlink == 1,
4675 ("softdep_setup_create: Invalid link count."));
4676 dvp = ITOV(dp);
4677 ACQUIRE_LOCK(ITOUMP(dp));
4678 inodedep = inodedep_lookup_ip(ip);
4679 if (DOINGSUJ(dvp)) {
4680 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4681 inoreflst);
4682 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4683 ("softdep_setup_create: No addref structure present."));
4684 }
4685 softdep_prelink(dvp, NULL);
4686 FREE_LOCK(ITOUMP(dp));
4687 }
4688
4689 /*
4690 * Create a jaddref structure to track the addition of a DOTDOT link when
4691 * we are reparenting an inode as part of a rename. This jaddref will be
4692 * found by softdep_setup_directory_change. Adjusts nlinkdelta for
4693 * non-journaling softdep.
4694 */
4695 void
4696 softdep_setup_dotdot_link(dp, ip)
4697 struct inode *dp;
4698 struct inode *ip;
4699 {
4700 struct inodedep *inodedep;
4701 struct jaddref *jaddref;
4702 struct vnode *dvp;
4703
4704 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4705 ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4706 dvp = ITOV(dp);
4707 jaddref = NULL;
4708 /*
4709 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4710 * is used as a normal link would be.
4711 */
4712 if (DOINGSUJ(dvp))
4713 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4714 dp->i_effnlink - 1, dp->i_mode);
4715 ACQUIRE_LOCK(ITOUMP(dp));
4716 inodedep = inodedep_lookup_ip(dp);
4717 if (jaddref)
4718 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4719 if_deps);
4720 softdep_prelink(dvp, ITOV(ip));
4721 FREE_LOCK(ITOUMP(dp));
4722 }
4723
4724 /*
4725 * Create a jaddref structure to track a new link to an inode. The directory
4726 * offset is not known until softdep_setup_directory_add or
4727 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling
4728 * softdep.
4729 */
4730 void
4731 softdep_setup_link(dp, ip)
4732 struct inode *dp;
4733 struct inode *ip;
4734 {
4735 struct inodedep *inodedep;
4736 struct jaddref *jaddref;
4737 struct vnode *dvp;
4738
4739 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4740 ("softdep_setup_link called on non-softdep filesystem"));
4741 dvp = ITOV(dp);
4742 jaddref = NULL;
4743 if (DOINGSUJ(dvp))
4744 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4745 ip->i_mode);
4746 ACQUIRE_LOCK(ITOUMP(dp));
4747 inodedep = inodedep_lookup_ip(ip);
4748 if (jaddref)
4749 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4750 if_deps);
4751 softdep_prelink(dvp, ITOV(ip));
4752 FREE_LOCK(ITOUMP(dp));
4753 }
4754
4755 /*
4756 * Called to create the jaddref structures to track . and .. references as
4757 * well as look up and further initialize the incomplete jaddref created
4758 * by softdep_setup_inomapdep when the inode was allocated. Adjusts
4759 * nlinkdelta for non-journaling softdep.
4760 */
4761 void
4762 softdep_setup_mkdir(dp, ip)
4763 struct inode *dp;
4764 struct inode *ip;
4765 {
4766 struct inodedep *inodedep;
4767 struct jaddref *dotdotaddref;
4768 struct jaddref *dotaddref;
4769 struct jaddref *jaddref;
4770 struct vnode *dvp;
4771
4772 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4773 ("softdep_setup_mkdir called on non-softdep filesystem"));
4774 dvp = ITOV(dp);
4775 dotaddref = dotdotaddref = NULL;
4776 if (DOINGSUJ(dvp)) {
4777 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4778 ip->i_mode);
4779 dotaddref->ja_state |= MKDIR_BODY;
4780 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4781 dp->i_effnlink - 1, dp->i_mode);
4782 dotdotaddref->ja_state |= MKDIR_PARENT;
4783 }
4784 ACQUIRE_LOCK(ITOUMP(dp));
4785 inodedep = inodedep_lookup_ip(ip);
4786 if (DOINGSUJ(dvp)) {
4787 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4788 inoreflst);
4789 KASSERT(jaddref != NULL,
4790 ("softdep_setup_mkdir: No addref structure present."));
4791 KASSERT(jaddref->ja_parent == dp->i_number,
4792 ("softdep_setup_mkdir: bad parent %ju",
4793 (uintmax_t)jaddref->ja_parent));
4794 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4795 if_deps);
4796 }
4797 inodedep = inodedep_lookup_ip(dp);
4798 if (DOINGSUJ(dvp))
4799 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4800 &dotdotaddref->ja_ref, if_deps);
4801 softdep_prelink(ITOV(dp), NULL);
4802 FREE_LOCK(ITOUMP(dp));
4803 }
4804
4805 /*
4806 * Called to track nlinkdelta of the inode and parent directories prior to
4807 * unlinking a directory.
4808 */
4809 void
4810 softdep_setup_rmdir(dp, ip)
4811 struct inode *dp;
4812 struct inode *ip;
4813 {
4814 struct vnode *dvp;
4815
4816 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4817 ("softdep_setup_rmdir called on non-softdep filesystem"));
4818 dvp = ITOV(dp);
4819 ACQUIRE_LOCK(ITOUMP(dp));
4820 (void) inodedep_lookup_ip(ip);
4821 (void) inodedep_lookup_ip(dp);
4822 softdep_prelink(dvp, ITOV(ip));
4823 FREE_LOCK(ITOUMP(dp));
4824 }
4825
4826 /*
4827 * Called to track nlinkdelta of the inode and parent directories prior to
4828 * unlink.
4829 */
4830 void
4831 softdep_setup_unlink(dp, ip)
4832 struct inode *dp;
4833 struct inode *ip;
4834 {
4835 struct vnode *dvp;
4836
4837 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4838 ("softdep_setup_unlink called on non-softdep filesystem"));
4839 dvp = ITOV(dp);
4840 ACQUIRE_LOCK(ITOUMP(dp));
4841 (void) inodedep_lookup_ip(ip);
4842 (void) inodedep_lookup_ip(dp);
4843 softdep_prelink(dvp, ITOV(ip));
4844 FREE_LOCK(ITOUMP(dp));
4845 }
4846
4847 /*
4848 * Called to release the journal structures created by a failed non-directory
4849 * creation. Adjusts nlinkdelta for non-journaling softdep.
4850 */
4851 void
4852 softdep_revert_create(dp, ip)
4853 struct inode *dp;
4854 struct inode *ip;
4855 {
4856 struct inodedep *inodedep;
4857 struct jaddref *jaddref;
4858 struct vnode *dvp;
4859
4860 KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
4861 ("softdep_revert_create called on non-softdep filesystem"));
4862 dvp = ITOV(dp);
4863 ACQUIRE_LOCK(ITOUMP(dp));
4864 inodedep = inodedep_lookup_ip(ip);
4865 if (DOINGSUJ(dvp)) {
4866 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4867 inoreflst);
4868 KASSERT(jaddref->ja_parent == dp->i_number,
4869 ("softdep_revert_create: addref parent mismatch"));
4870 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4871 }
4872 FREE_LOCK(ITOUMP(dp));
4873 }
4874
4875 /*
4876 * Called to release the journal structures created by a failed link
4877 * addition. Adjusts nlinkdelta for non-journaling softdep.
4878 */
4879 void
4880 softdep_revert_link(dp, ip)
4881 struct inode *dp;
4882 struct inode *ip;
4883 {
4884 struct inodedep *inodedep;
4885 struct jaddref *jaddref;
4886 struct vnode *dvp;
4887
4888 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4889 ("softdep_revert_link called on non-softdep filesystem"));
4890 dvp = ITOV(dp);
4891 ACQUIRE_LOCK(ITOUMP(dp));
4892 inodedep = inodedep_lookup_ip(ip);
4893 if (DOINGSUJ(dvp)) {
4894 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4895 inoreflst);
4896 KASSERT(jaddref->ja_parent == dp->i_number,
4897 ("softdep_revert_link: addref parent mismatch"));
4898 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4899 }
4900 FREE_LOCK(ITOUMP(dp));
4901 }
4902
4903 /*
4904 * Called to release the journal structures created by a failed mkdir
4905 * attempt. Adjusts nlinkdelta for non-journaling softdep.
4906 */
4907 void
4908 softdep_revert_mkdir(dp, ip)
4909 struct inode *dp;
4910 struct inode *ip;
4911 {
4912 struct inodedep *inodedep;
4913 struct jaddref *jaddref;
4914 struct jaddref *dotaddref;
4915 struct vnode *dvp;
4916
4917 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4918 ("softdep_revert_mkdir called on non-softdep filesystem"));
4919 dvp = ITOV(dp);
4920
4921 ACQUIRE_LOCK(ITOUMP(dp));
4922 inodedep = inodedep_lookup_ip(dp);
4923 if (DOINGSUJ(dvp)) {
4924 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4925 inoreflst);
4926 KASSERT(jaddref->ja_parent == ip->i_number,
4927 ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4928 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4929 }
4930 inodedep = inodedep_lookup_ip(ip);
4931 if (DOINGSUJ(dvp)) {
4932 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4933 inoreflst);
4934 KASSERT(jaddref->ja_parent == dp->i_number,
4935 ("softdep_revert_mkdir: addref parent mismatch"));
4936 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4937 inoreflst, if_deps);
4938 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4939 KASSERT(dotaddref->ja_parent == ip->i_number,
4940 ("softdep_revert_mkdir: dot addref parent mismatch"));
4941 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4942 }
4943 FREE_LOCK(ITOUMP(dp));
4944 }
4945
4946 /*
4947 * Called to correct nlinkdelta after a failed rmdir.
4948 */
4949 void
4950 softdep_revert_rmdir(dp, ip)
4951 struct inode *dp;
4952 struct inode *ip;
4953 {
4954
4955 KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
4956 ("softdep_revert_rmdir called on non-softdep filesystem"));
4957 ACQUIRE_LOCK(ITOUMP(dp));
4958 (void) inodedep_lookup_ip(ip);
4959 (void) inodedep_lookup_ip(dp);
4960 FREE_LOCK(ITOUMP(dp));
4961 }
4962
4963 /*
4964 * Protecting the freemaps (or bitmaps).
4965 *
4966 * To eliminate the need to execute fsck before mounting a filesystem
4967 * after a power failure, one must (conservatively) guarantee that the
4968 * on-disk copy of the bitmaps never indicate that a live inode or block is
4969 * free. So, when a block or inode is allocated, the bitmap should be
4970 * updated (on disk) before any new pointers. When a block or inode is
4971 * freed, the bitmap should not be updated until all pointers have been
4972 * reset. The latter dependency is handled by the delayed de-allocation
4973 * approach described below for block and inode de-allocation. The former
4974 * dependency is handled by calling the following procedure when a block or
4975 * inode is allocated. When an inode is allocated an "inodedep" is created
4976 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4977 * Each "inodedep" is also inserted into the hash indexing structure so
4978 * that any additional link additions can be made dependent on the inode
4979 * allocation.
4980 *
4981 * The ufs filesystem maintains a number of free block counts (e.g., per
4982 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4983 * in addition to the bitmaps. These counts are used to improve efficiency
4984 * during allocation and therefore must be consistent with the bitmaps.
4985 * There is no convenient way to guarantee post-crash consistency of these
4986 * counts with simple update ordering, for two main reasons: (1) The counts
4987 * and bitmaps for a single cylinder group block are not in the same disk
4988 * sector. If a disk write is interrupted (e.g., by power failure), one may
4989 * be written and the other not. (2) Some of the counts are located in the
4990 * superblock rather than the cylinder group block. So, we focus our soft
4991 * updates implementation on protecting the bitmaps. When mounting a
4992 * filesystem, we recompute the auxiliary counts from the bitmaps.
4993 */
4994
4995 /*
4996 * Called just after updating the cylinder group block to allocate an inode.
4997 */
4998 void
4999 softdep_setup_inomapdep(bp, ip, newinum, mode)
5000 struct buf *bp; /* buffer for cylgroup block with inode map */
5001 struct inode *ip; /* inode related to allocation */
5002 ino_t newinum; /* new inode number being allocated */
5003 int mode;
5004 {
5005 struct inodedep *inodedep;
5006 struct bmsafemap *bmsafemap;
5007 struct jaddref *jaddref;
5008 struct mount *mp;
5009 struct fs *fs;
5010
5011 mp = ITOVFS(ip);
5012 KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5013 ("softdep_setup_inomapdep called on non-softdep filesystem"));
5014 fs = VFSTOUFS(mp)->um_fs;
5015 jaddref = NULL;
5016
5017 /*
5018 * Allocate the journal reference add structure so that the bitmap
5019 * can be dependent on it.
5020 */
5021 if (MOUNTEDSUJ(mp)) {
5022 jaddref = newjaddref(ip, newinum, 0, 0, mode);
5023 jaddref->ja_state |= NEWBLOCK;
5024 }
5025
5026 /*
5027 * Create a dependency for the newly allocated inode.
5028 * Panic if it already exists as something is seriously wrong.
5029 * Otherwise add it to the dependency list for the buffer holding
5030 * the cylinder group map from which it was allocated.
5031 *
5032 * We have to preallocate a bmsafemap entry in case it is needed
5033 * in bmsafemap_lookup since once we allocate the inodedep, we
5034 * have to finish initializing it before we can FREE_LOCK().
5035 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5036 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5037 * creating the inodedep as it can be freed during the time
5038 * that we FREE_LOCK() while allocating the inodedep. We must
5039 * call workitem_alloc() before entering the locked section as
5040 * it also acquires the lock and we must avoid trying to do so
5041 * recursively.
5042 */
5043 bmsafemap = malloc(sizeof(struct bmsafemap),
5044 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5045 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5046 ACQUIRE_LOCK(ITOUMP(ip));
5047 if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5048 panic("softdep_setup_inomapdep: dependency %p for new "
5049 "inode already exists", inodedep);
5050 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5051 if (jaddref) {
5052 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5053 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5054 if_deps);
5055 } else {
5056 inodedep->id_state |= ONDEPLIST;
5057 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5058 }
5059 inodedep->id_bmsafemap = bmsafemap;
5060 inodedep->id_state &= ~DEPCOMPLETE;
5061 FREE_LOCK(ITOUMP(ip));
5062 }
5063
5064 /*
5065 * Called just after updating the cylinder group block to
5066 * allocate block or fragment.
5067 */
5068 void
5069 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5070 struct buf *bp; /* buffer for cylgroup block with block map */
5071 struct mount *mp; /* filesystem doing allocation */
5072 ufs2_daddr_t newblkno; /* number of newly allocated block */
5073 int frags; /* Number of fragments. */
5074 int oldfrags; /* Previous number of fragments for extend. */
5075 {
5076 struct newblk *newblk;
5077 struct bmsafemap *bmsafemap;
5078 struct jnewblk *jnewblk;
5079 struct ufsmount *ump;
5080 struct fs *fs;
5081
5082 KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5083 ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5084 ump = VFSTOUFS(mp);
5085 fs = ump->um_fs;
5086 jnewblk = NULL;
5087 /*
5088 * Create a dependency for the newly allocated block.
5089 * Add it to the dependency list for the buffer holding
5090 * the cylinder group map from which it was allocated.
5091 */
5092 if (MOUNTEDSUJ(mp)) {
5093 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5094 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5095 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5096 jnewblk->jn_state = ATTACHED;
5097 jnewblk->jn_blkno = newblkno;
5098 jnewblk->jn_frags = frags;
5099 jnewblk->jn_oldfrags = oldfrags;
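/*
 * When built with SUJ_DEBUG, verify that none of the fragments being
 * journaled as newly allocated are still marked free in the cylinder
 * group bitmap.
 */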
5100 #ifdef SUJ_DEBUG
5101 {
5102 struct cg *cgp;
5103 uint8_t *blksfree;
5104 long bno;
5105 int i;
5106
5107 cgp = (struct cg *)bp->b_data;
5108 blksfree = cg_blksfree(cgp);
5109 bno = dtogd(fs, jnewblk->jn_blkno);
5110 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5111 i++) {
5112 if (isset(blksfree, bno + i))
5113 panic("softdep_setup_blkmapdep: "
5114 "free fragment %d from %d-%d "
5115 "state 0x%X dep %p", i,
5116 jnewblk->jn_oldfrags,
5117 jnewblk->jn_frags,
5118 jnewblk->jn_state,
5119 jnewblk->jn_dep);
5120 }
5121 }
5122 #endif
5123 }
5124
5125 CTR3(KTR_SUJ,
5126 "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5127 newblkno, frags, oldfrags);
5128 ACQUIRE_LOCK(ump);
5129 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5130 panic("softdep_setup_blkmapdep: found block");
5131 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5132 dtog(fs, newblkno), NULL);
5133 if (jnewblk) {
5134 jnewblk->jn_dep = (struct worklist *)newblk;
5135 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5136 } else {
5137 newblk->nb_state |= ONDEPLIST;
5138 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5139 }
5140 newblk->nb_bmsafemap = bmsafemap;
5141 newblk->nb_jnewblk = jnewblk;
5142 FREE_LOCK(ump);
5143 }
5144
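/*
 * Hash a cylinder group number into the per-mount bmsafemap hash table.
 */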
5145 #define BMSAFEMAP_HASH(ump, cg) \
5146 (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5147
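/*
 * Search a bmsafemap hash chain for the entry tracking cylinder group cg.
 * Returns 1 and sets *bmsafemapp when found; otherwise returns 0 and sets
 * *bmsafemapp to NULL.
 */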
5148 static int
5149 bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5150 struct bmsafemap_hashhead *bmsafemaphd;
5151 int cg;
5152 struct bmsafemap **bmsafemapp;
5153 {
5154 struct bmsafemap *bmsafemap;
5155
5156 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5157 if (bmsafemap->sm_cg == cg)
5158 break;
5159 if (bmsafemap) {
5160 *bmsafemapp = bmsafemap;
5161 return (1);
5162 }
5163 *bmsafemapp = NULL;
5164
5165 return (0);
5166 }
5167
5168 /*
5169 * Find the bmsafemap associated with a cylinder group buffer.
5170 * If none exists, create one. The buffer must be locked when
5171 * this routine is called and this routine must be called with
5172 * the softdep lock held. To avoid giving up the lock while
5173 * allocating a new bmsafemap, a preallocated bmsafemap may be
5174 * provided. If it is provided but not needed, it is freed.
5175 */
5176 static struct bmsafemap *
5177 bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5178 struct mount *mp;
5179 struct buf *bp;
5180 int cg;
5181 struct bmsafemap *newbmsafemap;
5182 {
5183 struct bmsafemap_hashhead *bmsafemaphd;
5184 struct bmsafemap *bmsafemap, *collision;
5185 struct worklist *wk;
5186 struct ufsmount *ump;
5187
5188 ump = VFSTOUFS(mp);
5189 LOCK_OWNED(ump);
5190 KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5191 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5192 if (wk->wk_type == D_BMSAFEMAP) {
5193 if (newbmsafemap)
5194 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5195 return (WK_BMSAFEMAP(wk));
5196 }
5197 }
5198 bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5199 if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5200 if (newbmsafemap)
5201 WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5202 return (bmsafemap);
5203 }
5204 if (newbmsafemap) {
5205 bmsafemap = newbmsafemap;
5206 } else {
5207 FREE_LOCK(ump);
5208 bmsafemap = malloc(sizeof(struct bmsafemap),
5209 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5210 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5211 ACQUIRE_LOCK(ump);
5212 }
5213 bmsafemap->sm_buf = bp;
5214 LIST_INIT(&bmsafemap->sm_inodedephd);
5215 LIST_INIT(&bmsafemap->sm_inodedepwr);
5216 LIST_INIT(& |