1 /*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * The soft updates code is derived from the appendix of a University
5 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
6 * "Soft Updates: A Solution to the Metadata Update Problem in File
7 * Systems", CSE-TR-254-95, August 1995).
8 *
9 * Further information about soft updates can be obtained from:
10 *
11 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
12 * 1614 Oxford Street mckusick@mckusick.com
13 * Berkeley, CA 94709-1608 +1-510-843-9542
14 * USA
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
27 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
28 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
30 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
39 */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43
44 /*
45 * For now we want the safety net that the DEBUG flag provides.
46 */
47 #ifndef DEBUG
48 #define DEBUG
49 #endif
50
51 #include <sys/param.h>
52 #include <sys/kernel.h>
53 #include <sys/systm.h>
54 #include <sys/bio.h>
55 #include <sys/buf.h>
56 #include <sys/kdb.h>
57 #include <sys/kthread.h>
58 #include <sys/lock.h>
59 #include <sys/malloc.h>
60 #include <sys/mount.h>
61 #include <sys/mutex.h>
62 #include <sys/proc.h>
63 #include <sys/stat.h>
64 #include <sys/sysctl.h>
65 #include <sys/syslog.h>
66 #include <sys/vnode.h>
67 #include <sys/conf.h>
68 #include <ufs/ufs/dir.h>
69 #include <ufs/ufs/extattr.h>
70 #include <ufs/ufs/quota.h>
71 #include <ufs/ufs/inode.h>
72 #include <ufs/ufs/ufsmount.h>
73 #include <ufs/ffs/fs.h>
74 #include <ufs/ffs/softdep.h>
75 #include <ufs/ffs/ffs_extern.h>
76 #include <ufs/ufs/ufs_extern.h>
77
78 #include <vm/vm.h>
79
80 #include "opt_ffs.h"
81 #include "opt_quota.h"
82
83 #ifndef SOFTUPDATES
84
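/*
 * Stub routines used when soft updates support is not compiled into
 * the kernel.  Entry points that may legitimately be reached simply
 * return success; those that should never be called without soft
 * updates enabled panic instead.
 */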
85 int
86 softdep_flushfiles(oldmnt, flags, td)
87 struct mount *oldmnt;
88 int flags;
89 struct thread *td;
90 {
91
92 panic("softdep_flushfiles called");
93 }
94
95 int
96 softdep_mount(devvp, mp, fs, cred)
97 struct vnode *devvp;
98 struct mount *mp;
99 struct fs *fs;
100 struct ucred *cred;
101 {
102
103 return (0);
104 }
105
106 void
107 softdep_initialize()
108 {
109
110 return;
111 }
112
113 void
114 softdep_uninitialize()
115 {
116
117 return;
118 }
119
120 void
121 softdep_setup_inomapdep(bp, ip, newinum)
122 struct buf *bp;
123 struct inode *ip;
124 ino_t newinum;
125 {
126
127 panic("softdep_setup_inomapdep called");
128 }
129
130 void
131 softdep_setup_blkmapdep(bp, mp, newblkno)
132 struct buf *bp;
133 struct mount *mp;
134 ufs2_daddr_t newblkno;
135 {
136
137 panic("softdep_setup_blkmapdep called");
138 }
139
140 void
141 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
142 struct inode *ip;
143 ufs_lbn_t lbn;
144 ufs2_daddr_t newblkno;
145 ufs2_daddr_t oldblkno;
146 long newsize;
147 long oldsize;
148 struct buf *bp;
149 {
150
151 panic("softdep_setup_allocdirect called");
152 }
153
154 void
155 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
156 struct inode *ip;
157 ufs_lbn_t lbn;
158 ufs2_daddr_t newblkno;
159 ufs2_daddr_t oldblkno;
160 long newsize;
161 long oldsize;
162 struct buf *bp;
163 {
164
165 panic("softdep_setup_allocext called");
166 }
167
168 void
169 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
170 struct inode *ip;
171 ufs_lbn_t lbn;
172 struct buf *bp;
173 int ptrno;
174 ufs2_daddr_t newblkno;
175 ufs2_daddr_t oldblkno;
176 struct buf *nbp;
177 {
178
179 panic("softdep_setup_allocindir_page called");
180 }
181
182 void
183 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
184 struct buf *nbp;
185 struct inode *ip;
186 struct buf *bp;
187 int ptrno;
188 ufs2_daddr_t newblkno;
189 {
190
191 panic("softdep_setup_allocindir_meta called");
192 }
193
194 void
195 softdep_setup_freeblocks(ip, length, flags)
196 struct inode *ip;
197 off_t length;
198 int flags;
199 {
200
201 panic("softdep_setup_freeblocks called");
202 }
203
204 void
205 softdep_freefile(pvp, ino, mode)
206 struct vnode *pvp;
207 ino_t ino;
208 int mode;
209 {
210
211 panic("softdep_freefile called");
212 }
213
214 int
215 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
216 struct buf *bp;
217 struct inode *dp;
218 off_t diroffset;
219 ino_t newinum;
220 struct buf *newdirbp;
221 int isnewblk;
222 {
223
224 panic("softdep_setup_directory_add called");
225 }
226
227 void
228 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
229 struct inode *dp;
230 caddr_t base;
231 caddr_t oldloc;
232 caddr_t newloc;
233 int entrysize;
234 {
235
236 panic("softdep_change_directoryentry_offset called");
237 }
238
239 void
240 softdep_setup_remove(bp, dp, ip, isrmdir)
241 struct buf *bp;
242 struct inode *dp;
243 struct inode *ip;
244 int isrmdir;
245 {
246
247 panic("softdep_setup_remove called");
248 }
249
250 void
251 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
252 struct buf *bp;
253 struct inode *dp;
254 struct inode *ip;
255 ino_t newinum;
256 int isrmdir;
257 {
258
259 panic("softdep_setup_directory_change called");
260 }
261
262 void
263 softdep_change_linkcnt(ip)
264 struct inode *ip;
265 {
266
267 panic("softdep_change_linkcnt called");
268 }
269
270 void
271 softdep_load_inodeblock(ip)
272 struct inode *ip;
273 {
274
275 panic("softdep_load_inodeblock called");
276 }
277
278 void
279 softdep_update_inodeblock(ip, bp, waitfor)
280 struct inode *ip;
281 struct buf *bp;
282 int waitfor;
283 {
284
285 panic("softdep_update_inodeblock called");
286 }
287
288 int
289 softdep_fsync(vp)
290 struct vnode *vp; /* the "in_core" copy of the inode */
291 {
292
293 return (0);
294 }
295
296 void
297 softdep_fsync_mountdev(vp)
298 struct vnode *vp;
299 {
300
301 return;
302 }
303
304 int
305 softdep_flushworklist(oldmnt, countp, td)
306 struct mount *oldmnt;
307 int *countp;
308 struct thread *td;
309 {
310
311 *countp = 0;
312 return (0);
313 }
314
315 int
316 softdep_sync_metadata(struct vnode *vp)
317 {
318
319 return (0);
320 }
321
322 int
323 softdep_slowdown(vp)
324 struct vnode *vp;
325 {
326
327 panic("softdep_slowdown called");
328 }
329
330 void
331 softdep_releasefile(ip)
	struct inode *ip;	/* inode with a zero effective link count */
333 {
334
335 panic("softdep_releasefile called");
336 }
337
338 int
339 softdep_request_cleanup(fs, vp)
340 struct fs *fs;
341 struct vnode *vp;
342 {
343
344 return (0);
345 }
346
347 int
348 softdep_check_suspend(struct mount *mp,
349 struct vnode *devvp,
350 int softdep_deps,
351 int softdep_accdeps,
352 int secondary_writes,
353 int secondary_accwrites)
354 {
355 struct bufobj *bo;
356 int error;
357
	(void) softdep_deps;
	(void) softdep_accdeps;
360
361 ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
362 bo = &devvp->v_bufobj;
363
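	/*
	 * We need both the mount interlock and the vnode interlock to
	 * examine the suspension state.  If the mount interlock cannot
	 * be obtained without blocking, drop the vnode interlock first
	 * to preserve the lock order, then retry.  Likewise, wait for
	 * any secondary writes in progress to drain before deciding.
	 */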
364 for (;;) {
365 if (!MNT_ITRYLOCK(mp)) {
366 VI_UNLOCK(devvp);
367 MNT_ILOCK(mp);
368 MNT_IUNLOCK(mp);
369 VI_LOCK(devvp);
370 continue;
371 }
372 if (mp->mnt_secondary_writes != 0) {
373 VI_UNLOCK(devvp);
374 msleep(&mp->mnt_secondary_writes,
375 MNT_MTX(mp),
376 (PUSER - 1) | PDROP, "secwr", 0);
377 VI_LOCK(devvp);
378 continue;
379 }
380 break;
381 }
382
383 /*
384 * Reasons for needing more work before suspend:
385 * - Dirty buffers on devvp.
386 * - Secondary writes occurred after start of vnode sync loop
387 */
388 error = 0;
389 if (bo->bo_numoutput > 0 ||
390 bo->bo_dirty.bv_cnt > 0 ||
391 secondary_writes != 0 ||
392 mp->mnt_secondary_writes != 0 ||
393 secondary_accwrites != mp->mnt_secondary_accwrites)
394 error = EAGAIN;
395 VI_UNLOCK(devvp);
396 return (error);
397 }
398
399 void
400 softdep_get_depcounts(struct mount *mp,
401 int *softdepactivep,
402 int *softdepactiveaccp)
403 {
404 (void) mp;
405 *softdepactivep = 0;
406 *softdepactiveaccp = 0;
407 }
408
409 #else
410 /*
411 * These definitions need to be adapted to the system to which
412 * this file is being ported.
413 */
414 /*
415 * malloc types defined for the softdep system.
416 */
417 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
418 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
419 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
420 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
421 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
422 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
423 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
424 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
425 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
426 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
427 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
428 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
429 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
430 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
431 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
432
433 #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE)
434
435 #define D_PAGEDEP 0
436 #define D_INODEDEP 1
437 #define D_NEWBLK 2
438 #define D_BMSAFEMAP 3
439 #define D_ALLOCDIRECT 4
440 #define D_INDIRDEP 5
441 #define D_ALLOCINDIR 6
442 #define D_FREEFRAG 7
443 #define D_FREEBLKS 8
444 #define D_FREEFILE 9
445 #define D_DIRADD 10
446 #define D_MKDIR 11
447 #define D_DIRREM 12
448 #define D_NEWDIRBLK 13
449 #define D_LAST D_NEWDIRBLK
450
451 /*
452 * translate from workitem type to memory type
453 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
454 */
455 static struct malloc_type *memtype[] = {
456 M_PAGEDEP,
457 M_INODEDEP,
458 M_NEWBLK,
459 M_BMSAFEMAP,
460 M_ALLOCDIRECT,
461 M_INDIRDEP,
462 M_ALLOCINDIR,
463 M_FREEFRAG,
464 M_FREEBLKS,
465 M_FREEFILE,
466 M_DIRADD,
467 M_MKDIR,
468 M_DIRREM,
469 M_NEWDIRBLK
470 };
471
472 #define DtoM(type) (memtype[type])
473
474 /*
475 * Names of malloc types.
476 */
477 #define TYPENAME(type) \
	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
479 /*
480 * End system adaptation definitions.
481 */
482
483 /*
484 * Forward declarations.
485 */
486 struct inodedep_hashhead;
487 struct newblk_hashhead;
488 struct pagedep_hashhead;
489
490 /*
491 * Internal function prototypes.
492 */
493 static void softdep_error(char *, int);
494 static void drain_output(struct vnode *);
495 static struct buf *getdirtybuf(struct buf *, struct mtx *, int);
496 static void clear_remove(struct thread *);
497 static void clear_inodedeps(struct thread *);
498 static int flush_pagedep_deps(struct vnode *, struct mount *,
499 struct diraddhd *);
500 static int flush_inodedep_deps(struct mount *, ino_t);
501 static int flush_deplist(struct allocdirectlst *, int, int *);
502 static int handle_written_filepage(struct pagedep *, struct buf *);
503 static void diradd_inode_written(struct diradd *, struct inodedep *);
504 static int handle_written_inodeblock(struct inodedep *, struct buf *);
505 static void handle_allocdirect_partdone(struct allocdirect *);
506 static void handle_allocindir_partdone(struct allocindir *);
507 static void initiate_write_filepage(struct pagedep *, struct buf *);
508 static void handle_written_mkdir(struct mkdir *, int);
509 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
510 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
511 static void handle_workitem_freefile(struct freefile *);
512 static void handle_workitem_remove(struct dirrem *, struct vnode *);
513 static struct dirrem *newdirrem(struct buf *, struct inode *,
514 struct inode *, int, struct dirrem **);
515 static void free_diradd(struct diradd *);
516 static void free_allocindir(struct allocindir *, struct inodedep *);
517 static void free_newdirblk(struct newdirblk *);
518 static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
519 ufs2_daddr_t *);
520 static void deallocate_dependencies(struct buf *, struct inodedep *);
521 static void free_allocdirect(struct allocdirectlst *,
522 struct allocdirect *, int);
523 static int check_inode_unwritten(struct inodedep *);
524 static int free_inodedep(struct inodedep *);
525 static void handle_workitem_freeblocks(struct freeblks *, int);
526 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
527 static void setup_allocindir_phase2(struct buf *, struct inode *,
528 struct allocindir *);
529 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
530 ufs2_daddr_t);
531 static void handle_workitem_freefrag(struct freefrag *);
532 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
533 static void allocdirect_merge(struct allocdirectlst *,
534 struct allocdirect *, struct allocdirect *);
535 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
536 static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
537 struct newblk **);
538 static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
539 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
540 struct inodedep **);
541 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
542 static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
543 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
544 struct mount *mp, int, struct pagedep **);
545 static void pause_timer(void *);
546 static int request_cleanup(struct mount *, int);
547 static int process_worklist_item(struct mount *, int);
548 static void add_to_worklist(struct worklist *);
549 static void softdep_flush(void);
550 static int softdep_speedup(void);
551
552 /*
553 * Exported softdep operations.
554 */
555 static void softdep_disk_io_initiation(struct buf *);
556 static void softdep_disk_write_complete(struct buf *);
557 static void softdep_deallocate_dependencies(struct buf *);
558 static int softdep_count_dependencies(struct buf *bp, int);
559
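/*
 * Global mutex protecting the soft updates data structures.  It is
 * taken and released through the ACQUIRE_LOCK and FREE_LOCK macros
 * defined below.
 */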
560 static struct mtx lk;
561 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
562
563 #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk)
564 #define ACQUIRE_LOCK(lk) mtx_lock(lk)
565 #define FREE_LOCK(lk) mtx_unlock(lk)
566
567 /*
568 * Worklist queue management.
569 * These routines require that the lock be held.
570 */
571 #ifndef /* NOT */ DEBUG
572 #define WORKLIST_INSERT(head, item) do { \
573 (item)->wk_state |= ONWORKLIST; \
574 LIST_INSERT_HEAD(head, item, wk_list); \
575 } while (0)
576 #define WORKLIST_REMOVE(item) do { \
577 (item)->wk_state &= ~ONWORKLIST; \
578 LIST_REMOVE(item, wk_list); \
579 } while (0)
580 #else /* DEBUG */
581 static void worklist_insert(struct workhead *, struct worklist *);
582 static void worklist_remove(struct worklist *);
583
584 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
585 #define WORKLIST_REMOVE(item) worklist_remove(item)
586
587 static void
588 worklist_insert(head, item)
589 struct workhead *head;
590 struct worklist *item;
591 {
592
593 mtx_assert(&lk, MA_OWNED);
594 if (item->wk_state & ONWORKLIST)
595 panic("worklist_insert: already on list");
596 item->wk_state |= ONWORKLIST;
597 LIST_INSERT_HEAD(head, item, wk_list);
598 }
599
600 static void
601 worklist_remove(item)
602 struct worklist *item;
603 {
604
605 mtx_assert(&lk, MA_OWNED);
606 if ((item->wk_state & ONWORKLIST) == 0)
607 panic("worklist_remove: not on list");
608 item->wk_state &= ~ONWORKLIST;
609 LIST_REMOVE(item, wk_list);
610 }
611 #endif /* DEBUG */
612
613 /*
614 * Routines for tracking and managing workitems.
615 */
616 static void workitem_free(struct worklist *, int);
617 static void workitem_alloc(struct worklist *, int, struct mount *);
618
619 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
620
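/*
 * Release a completed workitem: drop the per-mount dependency count,
 * wake up anyone in softdep_waitidle() waiting for that count to
 * drain, and free the memory.
 */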
621 static void
622 workitem_free(item, type)
623 struct worklist *item;
624 int type;
625 {
626 struct ufsmount *ump;
627 mtx_assert(&lk, MA_OWNED);
628
629 #ifdef DEBUG
630 if (item->wk_state & ONWORKLIST)
631 panic("workitem_free: still on list");
632 if (item->wk_type != type)
633 panic("workitem_free: type mismatch");
634 #endif
635 ump = VFSTOUFS(item->wk_mp);
636 if (--ump->softdep_deps == 0 && ump->softdep_req)
637 wakeup(&ump->softdep_deps);
638 FREE(item, DtoM(type));
639 }
640
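/*
 * Initialize the worklist header embedded in a new dependency
 * structure and account for it in the per-mount dependency counters.
 */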
641 static void
642 workitem_alloc(item, type, mp)
643 struct worklist *item;
644 int type;
645 struct mount *mp;
646 {
647 item->wk_type = type;
648 item->wk_mp = mp;
649 item->wk_state = 0;
650 ACQUIRE_LOCK(&lk);
651 VFSTOUFS(mp)->softdep_deps++;
652 VFSTOUFS(mp)->softdep_accdeps++;
653 FREE_LOCK(&lk);
654 }
655
656 /*
657 * Workitem queue management
658 */
659 static int max_softdeps; /* maximum number of structs before slowdown */
660 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */
661 static int tickdelay = 2; /* number of ticks to pause during slowdown */
662 static int proc_waiting; /* tracks whether we have a timeout posted */
663 static int *stat_countp; /* statistic to count in proc_waiting timeout */
664 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
665 static int req_pending;
666 static int req_clear_inodedeps; /* syncer process flush some inodedeps */
667 #define FLUSH_INODES 1
668 static int req_clear_remove; /* syncer process flush some freeblks */
669 #define FLUSH_REMOVE 2
670 #define FLUSH_REMOVE_WAIT 3
671 static long num_freeblkdep; /* number of freeblks workitems allocated */
672
673 /*
674 * runtime statistics
675 */
676 static int stat_worklist_push; /* number of worklist cleanups */
677 static int stat_blk_limit_push; /* number of times block limit neared */
678 static int stat_ino_limit_push; /* number of times inode limit neared */
679 static int stat_blk_limit_hit; /* number of times block slowdown imposed */
680 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
681 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
682 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
683 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
684 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
685 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
686
687 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
688 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
689 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
690 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
691 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
692 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
693 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
694 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
695 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
696 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
697 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
698 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
699 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
700 /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
701
702 SYSCTL_DECL(_vfs_ffs);
703
704 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */
705 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
706 &compute_summary_at_mount, 0, "Recompute summary at mount");
707
708 static struct proc *softdepproc;
709 static struct kproc_desc softdep_kp = {
710 "softdepflush",
711 softdep_flush,
712 &softdepproc
713 };
SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp);
715
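/*
 * Body of the "softdepflush" kernel process.  It services any pending
 * requests to clear inode or removal dependencies and then walks the
 * mount list, processing the worklist of every filesystem mounted
 * with soft updates.  When no work remains, it naps until awakened by
 * softdep_speedup() or until a one second timeout expires.
 */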
716 static void
717 softdep_flush(void)
718 {
719 struct mount *nmp;
720 struct mount *mp;
721 struct ufsmount *ump;
722 struct thread *td;
723 int remaining;
724 int vfslocked;
725
726 td = curthread;
727 td->td_pflags |= TDP_NORUNNINGBUF;
728
729 for (;;) {
730 kthread_suspend_check(softdepproc);
731 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL);
732 ACQUIRE_LOCK(&lk);
733 /*
734 * If requested, try removing inode or removal dependencies.
735 */
736 if (req_clear_inodedeps) {
737 clear_inodedeps(td);
738 req_clear_inodedeps -= 1;
739 wakeup_one(&proc_waiting);
740 }
741 if (req_clear_remove) {
742 clear_remove(td);
743 req_clear_remove -= 1;
744 wakeup_one(&proc_waiting);
745 }
746 FREE_LOCK(&lk);
747 VFS_UNLOCK_GIANT(vfslocked);
748 remaining = 0;
749 mtx_lock(&mountlist_mtx);
750 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
751 nmp = TAILQ_NEXT(mp, mnt_list);
752 if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
753 continue;
754 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
755 continue;
756 vfslocked = VFS_LOCK_GIANT(mp);
757 softdep_process_worklist(mp, 0);
758 ump = VFSTOUFS(mp);
759 remaining += ump->softdep_on_worklist -
760 ump->softdep_on_worklist_inprogress;
761 VFS_UNLOCK_GIANT(vfslocked);
762 mtx_lock(&mountlist_mtx);
763 nmp = TAILQ_NEXT(mp, mnt_list);
764 vfs_unbusy(mp, td);
765 }
766 mtx_unlock(&mountlist_mtx);
767 if (remaining)
768 continue;
769 ACQUIRE_LOCK(&lk);
770 if (!req_pending)
771 msleep(&req_pending, &lk, PVM, "sdflush", hz);
772 req_pending = 0;
773 FREE_LOCK(&lk);
774 }
775 }
776
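/*
 * Request that the softdep flush thread run soon and ask the syncer
 * to speed up as well.  Must be called with the softdep lock held.
 */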
777 static int
778 softdep_speedup(void)
779 {
780
781 mtx_assert(&lk, MA_OWNED);
782 if (req_pending == 0) {
783 req_pending = 1;
784 wakeup(&req_pending);
785 }
786
787 return speedup_syncer();
788 }
789
790 /*
791 * Add an item to the end of the work queue.
792 * This routine requires that the lock be held.
793 * This is the only routine that adds items to the list.
794 * The following routine is the only one that removes items
795 * and does so in order from first to last.
796 */
797 static void
798 add_to_worklist(wk)
799 struct worklist *wk;
800 {
801 struct ufsmount *ump;
802
803 mtx_assert(&lk, MA_OWNED);
804 ump = VFSTOUFS(wk->wk_mp);
805 if (wk->wk_state & ONWORKLIST)
806 panic("add_to_worklist: already on list");
807 wk->wk_state |= ONWORKLIST;
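	/*
	 * Append at the tail of the per-mount list.  softdep_worklist_tail
	 * always points at the last item, so the append is constant time.
	 */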
808 if (LIST_EMPTY(&ump->softdep_workitem_pending))
809 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
810 else
811 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
812 ump->softdep_worklist_tail = wk;
813 ump->softdep_on_worklist += 1;
814 }
815
816 /*
817 * Process that runs once per second to handle items in the background queue.
818 *
 * Note that we ensure that everything is done in the order in which it
 * appears in the queue. The code below depends on this property to ensure
821 * that blocks of a file are freed before the inode itself is freed. This
822 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
823 * until all the old ones have been purged from the dependency lists.
824 */
825 int
826 softdep_process_worklist(mp, full)
827 struct mount *mp;
828 int full;
829 {
830 struct thread *td = curthread;
831 int cnt, matchcnt, loopcount;
832 struct ufsmount *ump;
833 long starttime;
834
835 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
	/*
	 * Count the work items that we process so that our caller can
	 * tell whether any progress was made.
	 */
840 matchcnt = 0;
841 ump = VFSTOUFS(mp);
842 ACQUIRE_LOCK(&lk);
843 loopcount = 1;
844 starttime = time_second;
845 while (ump->softdep_on_worklist > 0) {
846 if ((cnt = process_worklist_item(mp, 0)) == -1)
847 break;
848 else
849 matchcnt += cnt;
850 /*
851 * If requested, try removing inode or removal dependencies.
852 */
853 if (req_clear_inodedeps) {
854 clear_inodedeps(td);
855 req_clear_inodedeps -= 1;
856 wakeup_one(&proc_waiting);
857 }
858 if (req_clear_remove) {
859 clear_remove(td);
860 req_clear_remove -= 1;
861 wakeup_one(&proc_waiting);
862 }
863 /*
864 * We do not generally want to stop for buffer space, but if
865 * we are really being a buffer hog, we will stop and wait.
866 */
867 if (loopcount++ % 128 == 0) {
868 FREE_LOCK(&lk);
869 bwillwrite();
870 ACQUIRE_LOCK(&lk);
871 }
872 /*
873 * Never allow processing to run for more than one
874 * second. Otherwise the other mountpoints may get
875 * excessively backlogged.
876 */
877 if (!full && starttime != time_second) {
878 matchcnt = -1;
879 break;
880 }
881 }
882 FREE_LOCK(&lk);
883 return (matchcnt);
884 }
885
886 /*
887 * Process one item on the worklist.
888 */
889 static int
890 process_worklist_item(mp, flags)
891 struct mount *mp;
892 int flags;
893 {
894 struct worklist *wk, *wkend;
895 struct ufsmount *ump;
896 struct vnode *vp;
897 int matchcnt = 0;
898
899 mtx_assert(&lk, MA_OWNED);
900 KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
901 /*
902 * If we are being called because of a process doing a
903 * copy-on-write, then it is not safe to write as we may
904 * recurse into the copy-on-write routine.
905 */
906 if (curthread->td_pflags & TDP_COWINPROGRESS)
907 return (-1);
908 /*
909 * Normally we just process each item on the worklist in order.
910 * However, if we are in a situation where we cannot lock any
911 * inodes, we have to skip over any dirrem requests whose
912 * vnodes are resident and locked.
913 */
914 ump = VFSTOUFS(mp);
915 vp = NULL;
916 LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
917 if (wk->wk_state & INPROGRESS)
918 continue;
919 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
920 break;
921 wk->wk_state |= INPROGRESS;
922 ump->softdep_on_worklist_inprogress++;
923 FREE_LOCK(&lk);
924 ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
925 LK_NOWAIT | LK_EXCLUSIVE, &vp);
926 ACQUIRE_LOCK(&lk);
927 wk->wk_state &= ~INPROGRESS;
928 ump->softdep_on_worklist_inprogress--;
929 if (vp != NULL)
930 break;
931 }
932 if (wk == 0)
933 return (-1);
934 /*
935 * Remove the item to be processed. If we are removing the last
936 * item on the list, we need to recalculate the tail pointer.
937 * As this happens rarely and usually when the list is short,
938 * we just run down the list to find it rather than tracking it
939 * in the above loop.
940 */
941 WORKLIST_REMOVE(wk);
942 if (wk == ump->softdep_worklist_tail) {
943 LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
944 if (LIST_NEXT(wkend, wk_list) == NULL)
945 break;
946 ump->softdep_worklist_tail = wkend;
947 }
948 ump->softdep_on_worklist -= 1;
949 FREE_LOCK(&lk);
950 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
951 panic("process_worklist_item: suspended filesystem");
952 matchcnt++;
953 switch (wk->wk_type) {
954
955 case D_DIRREM:
956 /* removal of a directory entry */
957 handle_workitem_remove(WK_DIRREM(wk), vp);
958 break;
959
960 case D_FREEBLKS:
961 /* releasing blocks and/or fragments from a file */
962 handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
963 break;
964
965 case D_FREEFRAG:
966 /* releasing a fragment when replaced as a file grows */
967 handle_workitem_freefrag(WK_FREEFRAG(wk));
968 break;
969
970 case D_FREEFILE:
971 /* releasing an inode when its link count drops to 0 */
972 handle_workitem_freefile(WK_FREEFILE(wk));
973 break;
974
975 default:
976 panic("%s_process_worklist: Unknown type %s",
977 "softdep", TYPENAME(wk->wk_type));
978 /* NOTREACHED */
979 }
980 vn_finished_secondary_write(mp);
981 ACQUIRE_LOCK(&lk);
982 return (matchcnt);
983 }
984
985 /*
986 * Move dependencies from one buffer to another.
987 */
988 void
989 softdep_move_dependencies(oldbp, newbp)
990 struct buf *oldbp;
991 struct buf *newbp;
992 {
993 struct worklist *wk, *wktail;
994
995 if (!LIST_EMPTY(&newbp->b_dep))
996 panic("softdep_move_dependencies: need merge code");
997 wktail = 0;
998 ACQUIRE_LOCK(&lk);
999 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1000 LIST_REMOVE(wk, wk_list);
1001 if (wktail == 0)
1002 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1003 else
1004 LIST_INSERT_AFTER(wktail, wk, wk_list);
1005 wktail = wk;
1006 }
1007 FREE_LOCK(&lk);
1008 }
1009
1010 /*
1011 * Purge the work list of all items associated with a particular mount point.
1012 */
1013 int
1014 softdep_flushworklist(oldmnt, countp, td)
1015 struct mount *oldmnt;
1016 int *countp;
1017 struct thread *td;
1018 {
1019 struct vnode *devvp;
1020 int count, error = 0;
1021 struct ufsmount *ump;
1022
1023 /*
1024 * Alternately flush the block device associated with the mount
1025 * point and process any dependencies that the flushing
1026 * creates. We continue until no more worklist dependencies
1027 * are found.
1028 */
1029 *countp = 0;
1030 ump = VFSTOUFS(oldmnt);
1031 devvp = ump->um_devvp;
1032 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1033 *countp += count;
1034 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
1035 error = VOP_FSYNC(devvp, MNT_WAIT, td);
1036 VOP_UNLOCK(devvp, 0, td);
1037 if (error)
1038 break;
1039 }
1040 return (error);
1041 }
1042
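/*
 * Wait for the outstanding dependency count on a mount point to drain
 * to zero, polling up to ten times.  Returns EBUSY if dependencies
 * remain after the flush.
 */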
1043 int
1044 softdep_waitidle(struct mount *mp)
1045 {
1046 struct ufsmount *ump;
1047 int error;
1048 int i;
1049
1050 ump = VFSTOUFS(mp);
1051 ACQUIRE_LOCK(&lk);
1052 for (i = 0; i < 10 && ump->softdep_deps; i++) {
1053 ump->softdep_req = 1;
1054 if (ump->softdep_on_worklist)
1055 panic("softdep_waitidle: work added after flush.");
1056 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
1057 }
1058 ump->softdep_req = 0;
1059 FREE_LOCK(&lk);
1060 error = 0;
1061 if (i == 10) {
1062 error = EBUSY;
1063 printf("softdep_waitidle: Failed to flush worklist for %p\n",
1064 mp);
1065 }
1066
1067 return (error);
1068 }
1069
1070 /*
1071 * Flush all vnodes and worklist items associated with a specified mount point.
1072 */
1073 int
1074 softdep_flushfiles(oldmnt, flags, td)
1075 struct mount *oldmnt;
1076 int flags;
1077 struct thread *td;
1078 {
1079 int error, count, loopcnt;
1080
1081 error = 0;
1082
1083 /*
1084 * Alternately flush the vnodes associated with the mount
1085 * point and process any dependencies that the flushing
	 * creates. In theory, this loop iterates at most twice,
	 * but we give it a few extra passes just to be sure.
1088 */
1089 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
1090 /*
1091 * Do another flush in case any vnodes were brought in
1092 * as part of the cleanup operations.
1093 */
1094 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
1095 break;
1096 if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
1097 count == 0)
1098 break;
1099 }
1100 /*
1101 * If we are unmounting then it is an error to fail. If we
1102 * are simply trying to downgrade to read-only, then filesystem
1103 * activity can keep us busy forever, so we just fail with EBUSY.
1104 */
1105 if (loopcnt == 0) {
1106 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
1107 panic("softdep_flushfiles: looping");
1108 error = EBUSY;
1109 }
1110 if (!error)
1111 error = softdep_waitidle(oldmnt);
1112 return (error);
1113 }
1114
1115 /*
1116 * Structure hashing.
1117 *
1118 * There are three types of structures that can be looked up:
1119 * 1) pagedep structures identified by mount point, inode number,
1120 * and logical block.
1121 * 2) inodedep structures identified by mount point and inode number.
1122 * 3) newblk structures identified by mount point and
1123 * physical block number.
1124 *
1125 * The "pagedep" and "inodedep" dependency structures are hashed
1126 * separately from the file blocks and inodes to which they correspond.
1127 * This separation helps when the in-memory copy of an inode or
1128 * file block must be replaced. It also obviates the need to access
1129 * an inode or file page when simply updating (or de-allocating)
1130 * dependency structures. Lookup of newblk structures is needed to
1131 * find newly allocated blocks when trying to associate them with
1132 * their allocdirect or allocindir structure.
1133 *
1134 * The lookup routines optionally create and hash a new instance when
1135 * an existing entry is not found.
1136 */
1137 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
1138 #define NODELAY 0x0002 /* cannot do background work */
1139
1140 /*
1141 * Structures and routines associated with pagedep caching.
1142 */
1143 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
1144 u_long pagedep_hash; /* size of hash table - 1 */
1145 #define PAGEDEP_HASH(mp, inum, lbn) \
1146 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
1147 pagedep_hash])
1148
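/*
 * Search a pagedep hash chain for a match on inode number, logical
 * block, and mount point.  A found entry is returned in *pagedeppp.
 * The return value is 1 when the entry may be used as is; it is 0
 * when no entry exists, or when the caller asked to allocate
 * (DEPALLOC) and the existing entry is not yet on a buffer worklist,
 * so that the caller knows to attach it.
 */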
1149 static int
1150 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
1151 struct pagedep_hashhead *pagedephd;
1152 ino_t ino;
1153 ufs_lbn_t lbn;
1154 struct mount *mp;
1155 int flags;
1156 struct pagedep **pagedeppp;
1157 {
1158 struct pagedep *pagedep;
1159
1160 LIST_FOREACH(pagedep, pagedephd, pd_hash)
1161 if (ino == pagedep->pd_ino &&
1162 lbn == pagedep->pd_lbn &&
1163 mp == pagedep->pd_list.wk_mp)
1164 break;
1165 if (pagedep) {
1166 *pagedeppp = pagedep;
1167 if ((flags & DEPALLOC) != 0 &&
1168 (pagedep->pd_state & ONWORKLIST) == 0)
1169 return (0);
1170 return (1);
1171 }
1172 *pagedeppp = NULL;
1173 return (0);
1174 }
1175 /*
1176 * Look up a pagedep. Return 1 if found, 0 if not found or found
1177 * when asked to allocate but not associated with any buffer.
1178 * If not found, allocate if DEPALLOC flag is passed.
1179 * Found or allocated entry is returned in pagedeppp.
1180 * This routine must be called with splbio interrupts blocked.
1181 */
1182 static int
1183 pagedep_lookup(ip, lbn, flags, pagedeppp)
1184 struct inode *ip;
1185 ufs_lbn_t lbn;
1186 int flags;
1187 struct pagedep **pagedeppp;
1188 {
1189 struct pagedep *pagedep;
1190 struct pagedep_hashhead *pagedephd;
1191 struct mount *mp;
1192 int ret;
1193 int i;
1194
1195 mtx_assert(&lk, MA_OWNED);
1196 mp = ITOV(ip)->v_mount;
1197 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1198
1199 ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1200 if (*pagedeppp || (flags & DEPALLOC) == 0)
1201 return (ret);
1202 FREE_LOCK(&lk);
1203 MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
1204 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
1205 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
1206 ACQUIRE_LOCK(&lk);
1207 ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
1208 if (*pagedeppp) {
1209 WORKITEM_FREE(pagedep, D_PAGEDEP);
1210 return (ret);
1211 }
1212 pagedep->pd_ino = ip->i_number;
1213 pagedep->pd_lbn = lbn;
1214 LIST_INIT(&pagedep->pd_dirremhd);
1215 LIST_INIT(&pagedep->pd_pendinghd);
1216 for (i = 0; i < DAHASHSZ; i++)
1217 LIST_INIT(&pagedep->pd_diraddhd[i]);
1218 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1219 *pagedeppp = pagedep;
1220 return (0);
1221 }
1222
1223 /*
1224 * Structures and routines associated with inodedep caching.
1225 */
1226 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1227 static u_long inodedep_hash; /* size of hash table - 1 */
1228 static long num_inodedep; /* number of inodedep allocated */
1229 #define INODEDEP_HASH(fs, inum) \
1230 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1231
1232 static int
1233 inodedep_find(inodedephd, fs, inum, inodedeppp)
1234 struct inodedep_hashhead *inodedephd;
1235 struct fs *fs;
1236 ino_t inum;
1237 struct inodedep **inodedeppp;
1238 {
1239 struct inodedep *inodedep;
1240
1241 LIST_FOREACH(inodedep, inodedephd, id_hash)
1242 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1243 break;
1244 if (inodedep) {
1245 *inodedeppp = inodedep;
1246 return (1);
1247 }
1248 *inodedeppp = NULL;
1249
1250 return (0);
1251 }
1252 /*
1253 * Look up an inodedep. Return 1 if found, 0 if not found.
1254 * If not found, allocate if DEPALLOC flag is passed.
1255 * Found or allocated entry is returned in inodedeppp.
1256 * This routine must be called with splbio interrupts blocked.
1257 */
1258 static int
1259 inodedep_lookup(mp, inum, flags, inodedeppp)
1260 struct mount *mp;
1261 ino_t inum;
1262 int flags;
1263 struct inodedep **inodedeppp;
1264 {
1265 struct inodedep *inodedep;
1266 struct inodedep_hashhead *inodedephd;
1267 struct fs *fs;
1268
1269 mtx_assert(&lk, MA_OWNED);
1270 fs = VFSTOUFS(mp)->um_fs;
1271 inodedephd = INODEDEP_HASH(fs, inum);
1272
1273 if (inodedep_find(inodedephd, fs, inum, inodedeppp))
1274 return (1);
1275 if ((flags & DEPALLOC) == 0)
1276 return (0);
1277 /*
1278 * If we are over our limit, try to improve the situation.
1279 */
1280 if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
1281 request_cleanup(mp, FLUSH_INODES);
1282 FREE_LOCK(&lk);
1283 MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
1284 M_INODEDEP, M_SOFTDEP_FLAGS);
1285 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
1286 ACQUIRE_LOCK(&lk);
1287 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
1288 WORKITEM_FREE(inodedep, D_INODEDEP);
1289 return (1);
1290 }
1291 num_inodedep += 1;
1292 inodedep->id_fs = fs;
1293 inodedep->id_ino = inum;
1294 inodedep->id_state = ALLCOMPLETE;
1295 inodedep->id_nlinkdelta = 0;
1296 inodedep->id_savedino1 = NULL;
1297 inodedep->id_savedsize = -1;
1298 inodedep->id_savedextsize = -1;
1299 inodedep->id_buf = NULL;
1300 LIST_INIT(&inodedep->id_pendinghd);
1301 LIST_INIT(&inodedep->id_inowait);
1302 LIST_INIT(&inodedep->id_bufwait);
1303 TAILQ_INIT(&inodedep->id_inoupdt);
1304 TAILQ_INIT(&inodedep->id_newinoupdt);
1305 TAILQ_INIT(&inodedep->id_extupdt);
1306 TAILQ_INIT(&inodedep->id_newextupdt);
1307 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1308 *inodedeppp = inodedep;
1309 return (0);
1310 }
1311
1312 /*
1313 * Structures and routines associated with newblk caching.
1314 */
1315 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1316 u_long newblk_hash; /* size of hash table - 1 */
1317 #define NEWBLK_HASH(fs, inum) \
1318 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1319
1320 static int
1321 newblk_find(newblkhd, fs, newblkno, newblkpp)
1322 struct newblk_hashhead *newblkhd;
1323 struct fs *fs;
1324 ufs2_daddr_t newblkno;
1325 struct newblk **newblkpp;
1326 {
1327 struct newblk *newblk;
1328
1329 LIST_FOREACH(newblk, newblkhd, nb_hash)
1330 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1331 break;
1332 if (newblk) {
1333 *newblkpp = newblk;
1334 return (1);
1335 }
1336 *newblkpp = NULL;
1337 return (0);
1338 }
1339
1340 /*
1341 * Look up a newblk. Return 1 if found, 0 if not found.
1342 * If not found, allocate if DEPALLOC flag is passed.
1343 * Found or allocated entry is returned in newblkpp.
1344 */
1345 static int
1346 newblk_lookup(fs, newblkno, flags, newblkpp)
1347 struct fs *fs;
1348 ufs2_daddr_t newblkno;
1349 int flags;
1350 struct newblk **newblkpp;
1351 {
1352 struct newblk *newblk;
1353 struct newblk_hashhead *newblkhd;
1354
1355 newblkhd = NEWBLK_HASH(fs, newblkno);
1356 if (newblk_find(newblkhd, fs, newblkno, newblkpp))
1357 return (1);
1358 if ((flags & DEPALLOC) == 0)
1359 return (0);
1360 FREE_LOCK(&lk);
1361 MALLOC(newblk, struct newblk *, sizeof(struct newblk),
1362 M_NEWBLK, M_SOFTDEP_FLAGS);
1363 ACQUIRE_LOCK(&lk);
1364 if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
1365 FREE(newblk, M_NEWBLK);
1366 return (1);
1367 }
1368 newblk->nb_state = 0;
1369 newblk->nb_fs = fs;
1370 newblk->nb_newblkno = newblkno;
1371 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1372 *newblkpp = newblk;
1373 return (0);
1374 }
1375
1376 /*
 * Executed during filesystem initialization, before
1378 * mounting any filesystems.
1379 */
1380 void
1381 softdep_initialize()
1382 {
1383
1384 LIST_INIT(&mkdirlisthd);
1385 max_softdeps = desiredvnodes * 4;
1386 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
1387 &pagedep_hash);
1388 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
1389 newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
1390
	/* Install the soft updates handlers in the bioops vector. */
1392 bioops.io_start = softdep_disk_io_initiation;
1393 bioops.io_complete = softdep_disk_write_complete;
1394 bioops.io_deallocate = softdep_deallocate_dependencies;
1395 bioops.io_countdeps = softdep_count_dependencies;
1396 }
1397
1398 /*
1399 * Executed after all filesystems have been unmounted during
1400 * filesystem module unload.
1401 */
1402 void
1403 softdep_uninitialize()
1404 {
1405
1406 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
1407 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
1408 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
1409 }
1410
1411 /*
1412 * Called at mount time to notify the dependency code that a
1413 * filesystem wishes to use it.
1414 */
1415 int
1416 softdep_mount(devvp, mp, fs, cred)
1417 struct vnode *devvp;
1418 struct mount *mp;
1419 struct fs *fs;
1420 struct ucred *cred;
1421 {
1422 struct csum_total cstotal;
1423 struct ufsmount *ump;
1424 struct cg *cgp;
1425 struct buf *bp;
1426 int error, cyl;
1427
1428 MNT_ILOCK(mp);
1429 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
1430 MNT_IUNLOCK(mp);
1431 ump = VFSTOUFS(mp);
1432 LIST_INIT(&ump->softdep_workitem_pending);
1433 ump->softdep_worklist_tail = NULL;
1434 ump->softdep_on_worklist = 0;
1435 ump->softdep_deps = 0;
1436 /*
1437 * When doing soft updates, the counters in the
1438 * superblock may have gotten out of sync. Recomputation
1439 * can take a long time and can be deferred for background
1440 * fsck. However, the old behavior of scanning the cylinder
1441 * groups and recalculating them at mount time is available
1442 * by setting vfs.ffs.compute_summary_at_mount to one.
1443 */
1444 if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
1445 return (0);
1446 bzero(&cstotal, sizeof cstotal);
1447 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1448 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1449 fs->fs_cgsize, cred, &bp)) != 0) {
1450 brelse(bp);
1451 return (error);
1452 }
1453 cgp = (struct cg *)bp->b_data;
1454 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1455 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1456 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1457 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1458 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1459 brelse(bp);
1460 }
1461 #ifdef DEBUG
1462 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1463 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
1464 #endif
1465 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1466 return (0);
1467 }
1468
1469 /*
1470 * Protecting the freemaps (or bitmaps).
1471 *
1472 * To eliminate the need to execute fsck before mounting a filesystem
1473 * after a power failure, one must (conservatively) guarantee that the
1474 * on-disk copy of the bitmaps never indicate that a live inode or block is
1475 * free. So, when a block or inode is allocated, the bitmap should be
1476 * updated (on disk) before any new pointers. When a block or inode is
1477 * freed, the bitmap should not be updated until all pointers have been
1478 * reset. The latter dependency is handled by the delayed de-allocation
1479 * approach described below for block and inode de-allocation. The former
1480 * dependency is handled by calling the following procedure when a block or
1481 * inode is allocated. When an inode is allocated an "inodedep" is created
1482 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1483 * Each "inodedep" is also inserted into the hash indexing structure so
1484 * that any additional link additions can be made dependent on the inode
1485 * allocation.
1486 *
1487 * The ufs filesystem maintains a number of free block counts (e.g., per
1488 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1489 * in addition to the bitmaps. These counts are used to improve efficiency
1490 * during allocation and therefore must be consistent with the bitmaps.
1491 * There is no convenient way to guarantee post-crash consistency of these
1492 * counts with simple update ordering, for two main reasons: (1) The counts
1493 * and bitmaps for a single cylinder group block are not in the same disk
1494 * sector. If a disk write is interrupted (e.g., by power failure), one may
1495 * be written and the other not. (2) Some of the counts are located in the
1496 * superblock rather than the cylinder group block. So, we focus our soft
1497 * updates implementation on protecting the bitmaps. When mounting a
1498 * filesystem, we recompute the auxiliary counts from the bitmaps.
1499 */
1500
1501 /*
1502 * Called just after updating the cylinder group block to allocate an inode.
1503 */
1504 void
1505 softdep_setup_inomapdep(bp, ip, newinum)
1506 struct buf *bp; /* buffer for cylgroup block with inode map */
1507 struct inode *ip; /* inode related to allocation */
1508 ino_t newinum; /* new inode number being allocated */
1509 {
1510 struct inodedep *inodedep;
1511 struct bmsafemap *bmsafemap;
1512
1513 /*
1514 * Create a dependency for the newly allocated inode.
1515 * Panic if it already exists as something is seriously wrong.
1516 * Otherwise add it to the dependency list for the buffer holding
1517 * the cylinder group map from which it was allocated.
1518 */
1519 ACQUIRE_LOCK(&lk);
1520 if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
1521 &inodedep)))
1522 panic("softdep_setup_inomapdep: dependency for new inode "
1523 "already exists");
1524 inodedep->id_buf = bp;
1525 inodedep->id_state &= ~DEPCOMPLETE;
1526 bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
1527 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1528 FREE_LOCK(&lk);
1529 }
1530
1531 /*
1532 * Called just after updating the cylinder group block to
1533 * allocate block or fragment.
1534 */
1535 void
1536 softdep_setup_blkmapdep(bp, mp, newblkno)
1537 struct buf *bp; /* buffer for cylgroup block with block map */
1538 struct mount *mp; /* filesystem doing allocation */
1539 ufs2_daddr_t newblkno; /* number of newly allocated block */
1540 {
1541 struct newblk *newblk;
1542 struct bmsafemap *bmsafemap;
1543 struct fs *fs;
1544
1545 fs = VFSTOUFS(mp)->um_fs;
1546 /*
1547 * Create a dependency for the newly allocated block.
1548 * Add it to the dependency list for the buffer holding
1549 * the cylinder group map from which it was allocated.
1550 */
1551 ACQUIRE_LOCK(&lk);
1552 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1553 panic("softdep_setup_blkmapdep: found block");
1554 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
1555 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1556 FREE_LOCK(&lk);
1557 }
1558
1559 /*
1560 * Find the bmsafemap associated with a cylinder group buffer.
1561 * If none exists, create one. The buffer must be locked when
1562 * this routine is called and this routine must be called with
1563 * splbio interrupts blocked.
1564 */
1565 static struct bmsafemap *
1566 bmsafemap_lookup(mp, bp)
1567 struct mount *mp;
1568 struct buf *bp;
1569 {
1570 struct bmsafemap *bmsafemap;
1571 struct worklist *wk;
1572
1573 mtx_assert(&lk, MA_OWNED);
1574 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1575 if (wk->wk_type == D_BMSAFEMAP)
1576 return (WK_BMSAFEMAP(wk));
1577 FREE_LOCK(&lk);
1578 MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
1579 M_BMSAFEMAP, M_SOFTDEP_FLAGS);
1580 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
1581 bmsafemap->sm_buf = bp;
1582 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1583 LIST_INIT(&bmsafemap->sm_allocindirhd);
1584 LIST_INIT(&bmsafemap->sm_inodedephd);
1585 LIST_INIT(&bmsafemap->sm_newblkhd);
1586 ACQUIRE_LOCK(&lk);
1587 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1588 return (bmsafemap);
1589 }
1590
1591 /*
1592 * Direct block allocation dependencies.
1593 *
1594 * When a new block is allocated, the corresponding disk locations must be
1595 * initialized (with zeros or new data) before the on-disk inode points to
1596 * them. Also, the freemap from which the block was allocated must be
1597 * updated (on disk) before the inode's pointer. These two dependencies are
1598 * independent of each other and are needed for all file blocks and indirect
1599 * blocks that are pointed to directly by the inode. Just before the
1600 * "in-core" version of the inode is updated with a newly allocated block
1601 * number, a procedure (below) is called to setup allocation dependency
1602 * structures. These structures are removed when the corresponding
1603 * dependencies are satisfied or when the block allocation becomes obsolete
1604 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1605 * fragment that gets upgraded). All of these cases are handled in
1606 * procedures described later.
1607 *
1608 * When a file extension causes a fragment to be upgraded, either to a larger
1609 * fragment or to a full block, the on-disk location may change (if the
1610 * previous fragment could not simply be extended). In this case, the old
1611 * fragment must be de-allocated, but not until after the inode's pointer has
1612 * been updated. In most cases, this is handled by later procedures, which
1613 * will construct a "freefrag" structure to be added to the workitem queue
1614 * when the inode update is complete (or obsolete). The main exception to
1615 * this is when an allocation occurs while a pending allocation dependency
1616 * (for the same block pointer) remains. This case is handled in the main
1617 * allocation dependency setup procedure by immediately freeing the
1618 * unreferenced fragments.
1619 */
1620 void
1621 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1622 struct inode *ip; /* inode to which block is being added */
1623 ufs_lbn_t lbn; /* block pointer within inode */
1624 ufs2_daddr_t newblkno; /* disk block number being added */
1625 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
1626 long newsize; /* size of new block */
	long oldsize;		/* size of old block */
1628 struct buf *bp; /* bp for allocated block */
1629 {
1630 struct allocdirect *adp, *oldadp;
1631 struct allocdirectlst *adphead;
1632 struct bmsafemap *bmsafemap;
1633 struct inodedep *inodedep;
1634 struct pagedep *pagedep;
1635 struct newblk *newblk;
1636 struct mount *mp;
1637
1638 mp = UFSTOVFS(ip->i_ump);
1639 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1640 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1641 workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1642 adp->ad_lbn = lbn;
1643 adp->ad_newblkno = newblkno;
1644 adp->ad_oldblkno = oldblkno;
1645 adp->ad_newsize = newsize;
1646 adp->ad_oldsize = oldsize;
1647 adp->ad_state = ATTACHED;
1648 LIST_INIT(&adp->ad_newdirblk);
1649 if (newblkno == oldblkno)
1650 adp->ad_freefrag = NULL;
1651 else
1652 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1653
1654 ACQUIRE_LOCK(&lk);
1655 if (lbn >= NDADDR) {
1656 /* allocating an indirect block */
1657 if (oldblkno != 0)
1658 panic("softdep_setup_allocdirect: non-zero indir");
1659 } else {
1660 /*
1661 * Allocating a direct block.
1662 *
1663 * If we are allocating a directory block, then we must
1664 * allocate an associated pagedep to track additions and
1665 * deletions.
1666 */
1667 if ((ip->i_mode & IFMT) == IFDIR &&
1668 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1669 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1670 }
1671 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1672 panic("softdep_setup_allocdirect: lost block");
1673 if (newblk->nb_state == DEPCOMPLETE) {
1674 adp->ad_state |= DEPCOMPLETE;
1675 adp->ad_buf = NULL;
1676 } else {
1677 bmsafemap = newblk->nb_bmsafemap;
1678 adp->ad_buf = bmsafemap->sm_buf;
1679 LIST_REMOVE(newblk, nb_deps);
1680 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1681 }
1682 LIST_REMOVE(newblk, nb_hash);
1683 FREE(newblk, M_NEWBLK);
1684
1685 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1686 adp->ad_inodedep = inodedep;
1687 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1688 /*
	 * The list of allocdirects must be kept sorted in ascending
1690 * order so that the rollback routines can quickly determine the
1691 * first uncommitted block (the size of the file stored on disk
1692 * ends at the end of the lowest committed fragment, or if there
1693 * are no fragments, at the end of the highest committed block).
1694 * Since files generally grow, the typical case is that the new
1695 * block is to be added at the end of the list. We speed this
1696 * special case by checking against the last allocdirect in the
1697 * list before laboriously traversing the list looking for the
1698 * insertion point.
1699 */
1700 adphead = &inodedep->id_newinoupdt;
1701 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1702 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1703 /* insert at end of list */
1704 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1705 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1706 allocdirect_merge(adphead, adp, oldadp);
1707 FREE_LOCK(&lk);
1708 return;
1709 }
1710 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1711 if (oldadp->ad_lbn >= lbn)
1712 break;
1713 }
1714 if (oldadp == NULL)
1715 panic("softdep_setup_allocdirect: lost entry");
1716 /* insert in middle of list */
1717 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1718 if (oldadp->ad_lbn == lbn)
1719 allocdirect_merge(adphead, adp, oldadp);
1720 FREE_LOCK(&lk);
1721 }
1722
1723 /*
1724 * Replace an old allocdirect dependency with a newer one.
1725 * This routine must be called with splbio interrupts blocked.
1726 */
1727 static void
1728 allocdirect_merge(adphead, newadp, oldadp)
1729 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1730 struct allocdirect *newadp; /* allocdirect being added */
1731 struct allocdirect *oldadp; /* existing allocdirect being checked */
1732 {
1733 struct worklist *wk;
1734 struct freefrag *freefrag;
1735 struct newdirblk *newdirblk;
1736
1737 mtx_assert(&lk, MA_OWNED);
1738 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1739 newadp->ad_oldsize != oldadp->ad_newsize ||
1740 newadp->ad_lbn >= NDADDR)
1741 panic("%s %jd != new %jd || old size %ld != new %ld",
1742 "allocdirect_merge: old blkno",
1743 (intmax_t)newadp->ad_oldblkno,
1744 (intmax_t)oldadp->ad_newblkno,
1745 newadp->ad_oldsize, oldadp->ad_newsize);
1746 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1747 newadp->ad_oldsize = oldadp->ad_oldsize;
1748 /*
1749 * If the old dependency had a fragment to free or had never
1750 * previously had a block allocated, then the new dependency
1751 * can immediately post its freefrag and adopt the old freefrag.
1752 * This action is done by swapping the freefrag dependencies.
1753 * The new dependency gains the old one's freefrag, and the
1754 * old one gets the new one and then immediately puts it on
1755 * the worklist when it is freed by free_allocdirect. It is
1756 * not possible to do this swap when the old dependency had a
1757 * non-zero size but no previous fragment to free. This condition
1758 * arises when the new block is an extension of the old block.
1759 * Here, the first part of the fragment allocated to the new
1760 * dependency is part of the block currently claimed on disk by
1761 * the old dependency, so cannot legitimately be freed until the
1762 * conditions for the new dependency are fulfilled.
1763 */
1764 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1765 freefrag = newadp->ad_freefrag;
1766 newadp->ad_freefrag = oldadp->ad_freefrag;
1767 oldadp->ad_freefrag = freefrag;
1768 }
1769 /*
1770 * If we are tracking a new directory-block allocation,
1771 * move it from the old allocdirect to the new allocdirect.
1772 */
1773 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1774 newdirblk = WK_NEWDIRBLK(wk);
1775 WORKLIST_REMOVE(&newdirblk->db_list);
1776 if (!LIST_EMPTY(&oldadp->ad_newdirblk))
1777 panic("allocdirect_merge: extra newdirblk");
1778 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1779 }
1780 free_allocdirect(adphead, oldadp, 0);
1781 }
1782
1783 /*
1784 * Allocate a new freefrag structure if needed.
1785 */
1786 static struct freefrag *
1787 newfreefrag(ip, blkno, size)
1788 struct inode *ip;
1789 ufs2_daddr_t blkno;
1790 long size;
1791 {
1792 struct freefrag *freefrag;
1793 struct fs *fs;
1794
1795 if (blkno == 0)
1796 return (NULL);
1797 fs = ip->i_fs;
1798 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1799 panic("newfreefrag: frag size");
1800 MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
1801 M_FREEFRAG, M_SOFTDEP_FLAGS);
1802 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
1803 freefrag->ff_inum = ip->i_number;
1804 freefrag->ff_blkno = blkno;
1805 freefrag->ff_fragsize = size;
1806 return (freefrag);
1807 }
1808
1809 /*
1810 * This workitem de-allocates fragments that were replaced during
1811 * file block allocation.
1812 */
1813 static void
1814 handle_workitem_freefrag(freefrag)
1815 struct freefrag *freefrag;
1816 {
1817 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
1818
1819 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
1820 freefrag->ff_fragsize, freefrag->ff_inum);
1821 ACQUIRE_LOCK(&lk);
1822 WORKITEM_FREE(freefrag, D_FREEFRAG);
1823 FREE_LOCK(&lk);
1824 }
1825
1826 /*
1827 * Set up a dependency structure for an external attributes data block.
1828 * This routine follows much of the structure of softdep_setup_allocdirect.
1829 * See the description of softdep_setup_allocdirect above for details.
1830 */
1831 void
1832 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1833 struct inode *ip;
1834 ufs_lbn_t lbn;
1835 ufs2_daddr_t newblkno;
1836 ufs2_daddr_t oldblkno;
1837 long newsize;
1838 long oldsize;
1839 struct buf *bp;
1840 {
1841 struct allocdirect *adp, *oldadp;
1842 struct allocdirectlst *adphead;
1843 struct bmsafemap *bmsafemap;
1844 struct inodedep *inodedep;
1845 struct newblk *newblk;
1846 struct mount *mp;
1847
1848 mp = UFSTOVFS(ip->i_ump);
1849 MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
1850 M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
1851 workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
1852 adp->ad_lbn = lbn;
1853 adp->ad_newblkno = newblkno;
1854 adp->ad_oldblkno = oldblkno;
1855 adp->ad_newsize = newsize;
1856 adp->ad_oldsize = oldsize;
1857 adp->ad_state = ATTACHED | EXTDATA;
1858 LIST_INIT(&adp->ad_newdirblk);
1859 if (newblkno == oldblkno)
1860 adp->ad_freefrag = NULL;
1861 else
1862 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1863
1864 ACQUIRE_LOCK(&lk);
1865 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1866 panic("softdep_setup_allocext: lost block");
1867
1868 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1869 adp->ad_inodedep = inodedep;
1870
1871 if (newblk->nb_state == DEPCOMPLETE) {
1872 adp->ad_state |= DEPCOMPLETE;
1873 adp->ad_buf = NULL;
1874 } else {
1875 bmsafemap = newblk->nb_bmsafemap;
1876 adp->ad_buf = bmsafemap->sm_buf;
1877 LIST_REMOVE(newblk, nb_deps);
1878 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1879 }
1880 LIST_REMOVE(newblk, nb_hash);
1881 FREE(newblk, M_NEWBLK);
1882
1883 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1884 if (lbn >= NXADDR)
1885 panic("softdep_setup_allocext: lbn %lld >= NXADDR",
1886 (long long)lbn);
1887 /*
1888 * The list of allocdirects must be kept in sorted and ascending
1889 * order so that the rollback routines can quickly determine the
1890 * first uncommitted block (the size of the file stored on disk
1891 * ends at the end of the lowest committed fragment, or if there
1892 * are no fragments, at the end of the highest committed block).
1893 * Since files generally grow, the typical case is that the new
1894 * block is to be added at the end of the list. We speed this
1895 * special case by checking against the last allocdirect in the
1896 * list before laboriously traversing the list looking for the
1897 * insertion point.
1898 */
1899 adphead = &inodedep->id_newextupdt;
1900 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1901 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1902 /* insert at end of list */
1903 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1904 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1905 allocdirect_merge(adphead, adp, oldadp);
1906 FREE_LOCK(&lk);
1907 return;
1908 }
1909 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1910 if (oldadp->ad_lbn >= lbn)
1911 break;
1912 }
1913 if (oldadp == NULL)
1914 panic("softdep_setup_allocext: lost entry");
1915 /* insert in middle of list */
1916 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1917 if (oldadp->ad_lbn == lbn)
1918 allocdirect_merge(adphead, adp, oldadp);
1919 FREE_LOCK(&lk);
1920 }
1921
1922 /*
1923 * Indirect block allocation dependencies.
1924 *
1925 * The same dependencies that exist for a direct block also exist when
1926 * a new block is allocated and pointed to by an entry in a block of
1927 * indirect pointers. The undo/redo states described above are also
1928 * used here. Because an indirect block contains many pointers that
1929 * may have dependencies, a second copy of the entire in-memory indirect
1930 * block is kept. The buffer cache copy is always completely up-to-date.
1931 * The second copy, which is used only as a source for disk writes,
1932 * contains only the safe pointers (i.e., those that have no remaining
1933 * update dependencies). The second copy is freed when all pointers
1934 * are safe. The cache is not allowed to replace indirect blocks with
1935 * pending update dependencies. If a buffer containing an indirect
1936 * block with dependencies is written, these routines will mark it
1937 * dirty again. It can only be successfully written once all the
1938 * dependencies are removed. The ffs_fsync routine in conjunction with
1939 * softdep_sync_metadata work together to get all the dependencies
1940 * removed so that a file can be successfully written to disk. Three
1941 * procedures are used when setting up indirect block pointer
1942 * dependencies. The division is necessary because of the organization
1943 * of the "balloc" routine and because of the distinction between file
1944 * pages and file metadata blocks.
1945 */
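/*
 * As an illustrative sketch (not compiled), once setup_allocindir_phase2
 * below has recorded a dependency for the pointer at offset "ptrno" of a
 * UFS2 indirect block, the two copies differ only in that slot:
 *
 *	((ufs2_daddr_t *)bp->b_data)[ptrno]                  == newblkno
 *	((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[ptrno] == oldblkno (0 if none)
 *
 * The safe copy supplies the value written to disk until the dependencies
 * for the newly allocated block are satisfied.
 */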
1946
1947 /*
1948 * Allocate a new allocindir structure.
1949 */
1950 static struct allocindir *
1951 newallocindir(ip, ptrno, newblkno, oldblkno)
1952 struct inode *ip; /* inode for file being extended */
1953 int ptrno; /* offset of pointer in indirect block */
1954 ufs2_daddr_t newblkno; /* disk block number being added */
1955 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
1956 {
1957 struct allocindir *aip;
1958
1959 MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
1960 M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
1961 workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
1962 aip->ai_state = ATTACHED;
1963 aip->ai_offset = ptrno;
1964 aip->ai_newblkno = newblkno;
1965 aip->ai_oldblkno = oldblkno;
1966 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1967 return (aip);
1968 }
1969
1970 /*
1971 * Called just before setting an indirect block pointer
1972 * to a newly allocated file page.
1973 */
1974 void
1975 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1976 struct inode *ip; /* inode for file being extended */
1977 ufs_lbn_t lbn; /* allocated block number within file */
1978 struct buf *bp; /* buffer with indirect blk referencing page */
1979 int ptrno; /* offset of pointer in indirect block */
1980 ufs2_daddr_t newblkno; /* disk block number being added */
1981 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
1982 struct buf *nbp; /* buffer holding allocated page */
1983 {
1984 struct allocindir *aip;
1985 struct pagedep *pagedep;
1986
1987 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
1988 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1989 ACQUIRE_LOCK(&lk);
1990 /*
1991 * If we are allocating a directory page, then we must
1992 * allocate an associated pagedep to track additions and
1993 * deletions.
1994 */
1995 if ((ip->i_mode & IFMT) == IFDIR &&
1996 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1997 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1998 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1999 setup_allocindir_phase2(bp, ip, aip);
2000 FREE_LOCK(&lk);
2001 }
2002
2003 /*
2004 * Called just before setting an indirect block pointer to a
2005 * newly allocated indirect block.
2006 */
2007 void
2008 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
2009 struct buf *nbp; /* newly allocated indirect block */
2010 struct inode *ip; /* inode for file being extended */
2011 struct buf *bp; /* indirect block referencing allocated block */
2012 int ptrno; /* offset of pointer in indirect block */
2013 ufs2_daddr_t newblkno; /* disk block number being added */
2014 {
2015 struct allocindir *aip;
2016
2017 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
2018 aip = newallocindir(ip, ptrno, newblkno, 0);
2019 ACQUIRE_LOCK(&lk);
2020 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
2021 setup_allocindir_phase2(bp, ip, aip);
2022 FREE_LOCK(&lk);
2023 }
2024
2025 /*
2026 * Called to finish the allocation of the "aip" allocated
2027 * by one of the two routines above.
2028 */
2029 static void
2030 setup_allocindir_phase2(bp, ip, aip)
2031 struct buf *bp; /* in-memory copy of the indirect block */
2032 struct inode *ip; /* inode for file being extended */
2033 struct allocindir *aip; /* allocindir allocated by the above routines */
2034 {
2035 struct worklist *wk;
2036 struct indirdep *indirdep, *newindirdep;
2037 struct bmsafemap *bmsafemap;
2038 struct allocindir *oldaip;
2039 struct freefrag *freefrag;
2040 struct newblk *newblk;
2041 ufs2_daddr_t blkno;
2042
2043 mtx_assert(&lk, MA_OWNED);
2044 if (bp->b_lblkno >= 0)
2045 panic("setup_allocindir_phase2: not indir blk");
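	/*
	 * The loop below first searches the buffer's dependency list for an
	 * existing indirdep. If none is found, a new one is allocated, which
	 * requires dropping the soft updates lock, so the search is repeated
	 * to catch an indirdep attached by a racing thread; the speculative
	 * allocation is discarded in that case.
	 */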
2046 for (indirdep = NULL, newindirdep = NULL; ; ) {
2047 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2048 if (wk->wk_type != D_INDIRDEP)
2049 continue;
2050 indirdep = WK_INDIRDEP(wk);
2051 break;
2052 }
2053 if (indirdep == NULL && newindirdep) {
2054 indirdep = newindirdep;
2055 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
2056 newindirdep = NULL;
2057 }
2058 if (indirdep) {
2059 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
2060 &newblk) == 0)
2061 panic("setup_allocindir: lost block");
2062 if (newblk->nb_state == DEPCOMPLETE) {
2063 aip->ai_state |= DEPCOMPLETE;
2064 aip->ai_buf = NULL;
2065 } else {
2066 bmsafemap = newblk->nb_bmsafemap;
2067 aip->ai_buf = bmsafemap->sm_buf;
2068 LIST_REMOVE(newblk, nb_deps);
2069 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
2070 aip, ai_deps);
2071 }
2072 LIST_REMOVE(newblk, nb_hash);
2073 FREE(newblk, M_NEWBLK);
2074 aip->ai_indirdep = indirdep;
2075 /*
2076 * Check to see if there is an existing dependency
2077 * for this block. If there is, merge the old
2078 * dependency into the new one.
2079 */
2080 if (aip->ai_oldblkno == 0)
2081 oldaip = NULL;
2082 else
2083
2084 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
2085 if (oldaip->ai_offset == aip->ai_offset)
2086 break;
2087 freefrag = NULL;
2088 if (oldaip != NULL) {
2089 if (oldaip->ai_newblkno != aip->ai_oldblkno)
2090 panic("setup_allocindir_phase2: blkno");
2091 aip->ai_oldblkno = oldaip->ai_oldblkno;
2092 freefrag = aip->ai_freefrag;
2093 aip->ai_freefrag = oldaip->ai_freefrag;
2094 oldaip->ai_freefrag = NULL;
2095 free_allocindir(oldaip, NULL);
2096 }
2097 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
2098 if (ip->i_ump->um_fstype == UFS1)
2099 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
2100 [aip->ai_offset] = aip->ai_oldblkno;
2101 else
2102 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
2103 [aip->ai_offset] = aip->ai_oldblkno;
2104 FREE_LOCK(&lk);
2105 if (freefrag != NULL)
2106 handle_workitem_freefrag(freefrag);
2107 } else
2108 FREE_LOCK(&lk);
2109 if (newindirdep) {
2110 newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
2111 brelse(newindirdep->ir_savebp);
2112 ACQUIRE_LOCK(&lk);
2113 WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
2114 if (indirdep)
2115 break;
2116 FREE_LOCK(&lk);
2117 }
2118 if (indirdep) {
2119 ACQUIRE_LOCK(&lk);
2120 break;
2121 }
2122 MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
2123 M_INDIRDEP, M_SOFTDEP_FLAGS);
2124 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
2125 UFSTOVFS(ip->i_ump));
2126 newindirdep->ir_state = ATTACHED;
2127 if (ip->i_ump->um_fstype == UFS1)
2128 newindirdep->ir_state |= UFS1FMT;
2129 LIST_INIT(&newindirdep->ir_deplisthd);
2130 LIST_INIT(&newindirdep->ir_donehd);
2131 if (bp->b_blkno == bp->b_lblkno) {
2132 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
2133 NULL, NULL);
2134 bp->b_blkno = blkno;
2135 }
2136 newindirdep->ir_savebp =
2137 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
2138 BUF_KERNPROC(newindirdep->ir_savebp);
2139 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
2140 ACQUIRE_LOCK(&lk);
2141 }
2142 }
2143
2144 /*
2145 * Block de-allocation dependencies.
2146 *
2147 * When blocks are de-allocated, the on-disk pointers must be nullified before
2148 * the blocks are made available for use by other files. (The true
2149 * requirement is that old pointers must be nullified before new on-disk
2150 * pointers are set. We chose this slightly more stringent requirement to
2151 * reduce complexity.) Our implementation handles this dependency by updating
2152 * the inode (or indirect block) appropriately but delaying the actual block
2153 * de-allocation (i.e., freemap and free space count manipulation) until
2154 * after the updated versions reach stable storage. After the disk is
2155 * updated, the blocks can be safely de-allocated whenever it is convenient.
2156 * This implementation handles only the common case of reducing a file's
2157 * length to zero. Other cases are handled by the conventional synchronous
2158 * write approach.
2159 *
2160 * The ffs implementation with which we worked double-checks
2161 * the state of the block pointers and file size as it reduces
2162 * a file's length. Some of this code is replicated here in our
2163 * soft updates implementation. The freeblks->fb_chkcnt field is
2164 * used to transfer a part of this information to the procedure
2165 * that eventually de-allocates the blocks.
2166 *
2167 * This routine should be called from the routine that shortens
2168 * a file's length, before the inode's size or block pointers
2169 * are modified. It will save the block pointer information for
2170 * later release and zero the inode so that the calling routine
2171 * can release it.
2172 */
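/*
 * As an illustrative sketch (not compiled), truncating a file to length
 * zero results in roughly the following ordering:
 *
 *	softdep_setup_freeblocks(ip, 0, IO_NORMAL | IO_EXT);
 *		(block pointers saved in a freeblks item, inode zero'ed)
 *	... the zero'ed inode block reaches stable storage ...
 *	handle_workitem_freeblocks(freeblks, 0);
 *		(blocks and fragments returned to the freemap)
 *
 * If the inode has never been written to disk, the routine below calls
 * handle_workitem_freeblocks directly; otherwise the freeblks item waits
 * on the inodedep's id_bufwait list until the zero'ed inode is written.
 */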
2173 void
2174 softdep_setup_freeblocks(ip, length, flags)
2175 struct inode *ip; /* The inode whose length is to be reduced */
2176 off_t length; /* The new length for the file */
2177 int flags; /* IO_EXT and/or IO_NORMAL */
2178 {
2179 struct freeblks *freeblks;
2180 struct inodedep *inodedep;
2181 struct allocdirect *adp;
2182 struct vnode *vp;
2183 struct buf *bp;
2184 struct fs *fs;
2185 ufs2_daddr_t extblocks, datablocks;
2186 struct mount *mp;
2187 int i, delay, error;
2188
2189 fs = ip->i_fs;
2190 mp = UFSTOVFS(ip->i_ump);
2191 if (length != 0)
2192 panic("softdep_setup_freeblocks: non-zero length");
2193 MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
2194 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
2195 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
2196 freeblks->fb_state = ATTACHED;
2197 freeblks->fb_uid = ip->i_uid;
2198 freeblks->fb_previousinum = ip->i_number;
2199 freeblks->fb_devvp = ip->i_devvp;
2200 ACQUIRE_LOCK(&lk);
2201 num_freeblkdep++;
2202 FREE_LOCK(&lk);
2203 extblocks = 0;
2204 if (fs->fs_magic == FS_UFS2_MAGIC)
2205 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
2206 datablocks = DIP(ip, i_blocks) - extblocks;
2207 if ((flags & IO_NORMAL) == 0) {
2208 freeblks->fb_oldsize = 0;
2209 freeblks->fb_chkcnt = 0;
2210 } else {
2211 freeblks->fb_oldsize = ip->i_size;
2212 ip->i_size = 0;
2213 DIP_SET(ip, i_size, 0);
2214 freeblks->fb_chkcnt = datablocks;
2215 for (i = 0; i < NDADDR; i++) {
2216 freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
2217 DIP_SET(ip, i_db[i], 0);
2218 }
2219 for (i = 0; i < NIADDR; i++) {
2220 freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
2221 DIP_SET(ip, i_ib[i], 0);
2222 }
2223 /*
2224 * If the file was removed, the space being freed was already
2225 * accounted for at that time (see softdep_releasefile()). If the
2226 * file is merely being truncated, we account for it now.
2227 */
2228 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2229 UFS_LOCK(ip->i_ump);
2230 fs->fs_pendingblocks += datablocks;
2231 UFS_UNLOCK(ip->i_ump);
2232 }
2233 }
2234 if ((flags & IO_EXT) == 0) {
2235 freeblks->fb_oldextsize = 0;
2236 } else {
2237 freeblks->fb_oldextsize = ip->i_din2->di_extsize;
2238 ip->i_din2->di_extsize = 0;
2239 freeblks->fb_chkcnt += extblocks;
2240 for (i = 0; i < NXADDR; i++) {
2241 freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
2242 ip->i_din2->di_extb[i] = 0;
2243 }
2244 }
2245 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
2246 /*
2247 * Push the zero'ed inode to its disk buffer so that we are free
2248 * to delete its dependencies below. Once the dependencies are gone
2249 * the buffer can be safely released.
2250 */
2251 if ((error = bread(ip->i_devvp,
2252 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
2253 (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
2254 brelse(bp);
2255 softdep_error("softdep_setup_freeblocks", error);
2256 }
2257 if (ip->i_ump->um_fstype == UFS1)
2258 *((struct ufs1_dinode *)bp->b_data +
2259 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
2260 else
2261 *((struct ufs2_dinode *)bp->b_data +
2262 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
2263 /*
2264 * Find and eliminate any inode dependencies.
2265 */
2266 ACQUIRE_LOCK(&lk);
2267 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
2268 if ((inodedep->id_state & IOSTARTED) != 0)
2269 panic("softdep_setup_freeblocks: inode busy");
2270 /*
2271 * Add the freeblks structure to the list of operations that
2272 * must await the zero'ed inode being written to disk. If we
2273 * still have a bitmap dependency (delay == 0), then the inode
2274 * has never been written to disk, so we can process the
2275 * freeblks below once we have deleted the dependencies.
2276 */
2277 delay = (inodedep->id_state & DEPCOMPLETE);
2278 if (delay)
2279 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
2280 /*
2281 * Because the file length has been truncated to zero, any
2282 * pending block allocation dependency structures associated
2283 * with this inode are obsolete and can simply be de-allocated.
2284 * We must first merge the two dependency lists to get rid of
2285 * any duplicate freefrag structures, then purge the merged list.
2286 * If we still have a bitmap dependency, then the inode has never
2287 * been written to disk, so we can free any fragments without delay.
2288 */
2289 if (flags & IO_NORMAL) {
2290 merge_inode_lists(&inodedep->id_newinoupdt,
2291 &inodedep->id_inoupdt);
2292 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2293 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2294 }
2295 if (flags & IO_EXT) {
2296 merge_inode_lists(&inodedep->id_newextupdt,
2297 &inodedep->id_extupdt);
2298 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
2299 free_allocdirect(&inodedep->id_extupdt, adp, delay);
2300 }
2301 FREE_LOCK(&lk);
2302 bdwrite(bp);
2303 /*
2304 * We must wait for any I/O in progress to finish so that
2305 * all potential buffers on the dirty list will be visible.
2306 * Once they are all there, walk the list and get rid of
2307 * any dependencies.
2308 */
2309 vp = ITOV(ip);
2310 VI_LOCK(vp);
2311 drain_output(vp);
2312 restart:
2313 TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
2314 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
2315 ((flags & IO_NORMAL) == 0 &&
2316 (bp->b_xflags & BX_ALTDATA) == 0))
2317 continue;
2318 if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
2319 goto restart;
2320 VI_UNLOCK(vp);
2321 ACQUIRE_LOCK(&lk);
2322 (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
2323 deallocate_dependencies(bp, inodedep);
2324 FREE_LOCK(&lk);
2325 bp->b_flags |= B_INVAL | B_NOCACHE;
2326 brelse(bp);
2327 VI_LOCK(vp);
2328 goto restart;
2329 }
2330 VI_UNLOCK(vp);
2331 ACQUIRE_LOCK(&lk);
2332 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
2333 (void) free_inodedep(inodedep);
2334
2335 if (delay) {
2336 freeblks->fb_state |= DEPCOMPLETE;
2337 /*
2338 * If the inode with zeroed block pointers is now on disk
2339 * we can start freeing blocks. Add freeblks to the worklist
2340 * instead of calling handle_workitem_freeblocks directly as
2341 * it is more likely that additional IO is needed to complete
2342 * the request here than in the !delay case.
2343 */
2344 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2345 add_to_worklist(&freeblks->fb_list);
2346 }
2347
2348 FREE_LOCK(&lk);
2349 /*
2350 * If the inode has never been written to disk (delay == 0),
2351 * then we can process the freeblks now that we have deleted
2352 * the dependencies.
2353 */
2354 if (!delay)
2355 handle_workitem_freeblocks(freeblks, 0);
2356 }
2357
2358 /*
2359 * Reclaim any dependency structures from a buffer that is about to
2360 * be reallocated to a new vnode. The buffer must be locked; thus,
2361 * no I/O completion operations can occur while we are manipulating
2362 * its associated dependencies. The mutex is held so that other I/Os
2363 * associated with related dependencies do not occur.
2364 */
2365 static void
2366 deallocate_dependencies(bp, inodedep)
2367 struct buf *bp;
2368 struct inodedep *inodedep;
2369 {
2370 struct worklist *wk;
2371 struct indirdep *indirdep;
2372 struct allocindir *aip;
2373 struct pagedep *pagedep;
2374 struct dirrem *dirrem;
2375 struct diradd *dap;
2376 int i;
2377
2378 mtx_assert(&lk, MA_OWNED);
2379 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2380 switch (wk->wk_type) {
2381
2382 case D_INDIRDEP:
2383 indirdep = WK_INDIRDEP(wk);
2384 /*
2385 * None of the indirect pointers will ever be visible,
2386 * so they can simply be tossed. GOINGAWAY ensures
2387 * that allocated pointers will be saved in the buffer
2388 * cache until they are freed. Note that they can
2389 * only be found by their physical address, since
2390 * the inode mapping the logical address will
2391 * be gone. The save buffer used for the safe copy
2392 * was allocated in setup_allocindir_phase2 using
2393 * the physical address so it could be used for this
2394 * purpose. Hence we swap the safe copy with the real
2395 * copy, allowing the safe copy to be freed and holding
2396 * on to the real copy for later use in indir_trunc.
2397 */
2398 if (indirdep->ir_state & GOINGAWAY)
2399 panic("deallocate_dependencies: already gone");
2400 indirdep->ir_state |= GOINGAWAY;
2401 VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
2402 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2403 free_allocindir(aip, inodedep);
2404 if (bp->b_lblkno >= 0 ||
2405 bp->b_blkno != indirdep->ir_savebp->b_lblkno)
2406 panic("deallocate_dependencies: not indir");
2407 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2408 bp->b_bcount);
2409 WORKLIST_REMOVE(wk);
2410 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2411 continue;
2412
2413 case D_PAGEDEP:
2414 pagedep = WK_PAGEDEP(wk);
2415 /*
2416 * None of the directory additions will ever be
2417 * visible, so they can simply be tossed.
2418 */
2419 for (i = 0; i < DAHASHSZ; i++)
2420 while ((dap =
2421 LIST_FIRST(&pagedep->pd_diraddhd[i])))
2422 free_diradd(dap);
2423 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2424 free_diradd(dap);
2425 /*
2426 * Copy any directory remove dependencies to the list
2427 * to be processed after the zero'ed inode is written.
2428 * If the inode has already been written, then they
2429 * can be dumped directly onto the work list.
2430 */
2431 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
2432 LIST_REMOVE(dirrem, dm_next);
2433 dirrem->dm_dirinum = pagedep->pd_ino;
2434 if (inodedep == NULL ||
2435 (inodedep->id_state & ALLCOMPLETE) ==
2436 ALLCOMPLETE)
2437 add_to_worklist(&dirrem->dm_list);
2438 else
2439 WORKLIST_INSERT(&inodedep->id_bufwait,
2440 &dirrem->dm_list);
2441 }
2442 if ((pagedep->pd_state & NEWBLOCK) != 0) {
2443 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2444 if (wk->wk_type == D_NEWDIRBLK &&
2445 WK_NEWDIRBLK(wk)->db_pagedep ==
2446 pagedep)
2447 break;
2448 if (wk != NULL) {
2449 WORKLIST_REMOVE(wk);
2450 free_newdirblk(WK_NEWDIRBLK(wk));
2451 } else
2452 panic("deallocate_dependencies: "
2453 "lost pagedep");
2454 }
2455 WORKLIST_REMOVE(&pagedep->pd_list);
2456 LIST_REMOVE(pagedep, pd_hash);
2457 WORKITEM_FREE(pagedep, D_PAGEDEP);
2458 continue;
2459
2460 case D_ALLOCINDIR:
2461 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2462 continue;
2463
2464 case D_ALLOCDIRECT:
2465 case D_INODEDEP:
2466 panic("deallocate_dependencies: Unexpected type %s",
2467 TYPENAME(wk->wk_type));
2468 /* NOTREACHED */
2469
2470 default:
2471 panic("deallocate_dependencies: Unknown type %s",
2472 TYPENAME(wk->wk_type));
2473 /* NOTREACHED */
2474 }
2475 }
2476 }
2477
2478 /*
2479 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2480 * This routine must be called with splbio interrupts blocked.
2481 */
2482 static void
2483 free_allocdirect(adphead, adp, delay)
2484 struct allocdirectlst *adphead;
2485 struct allocdirect *adp;
2486 int delay;
2487 {
2488 struct newdirblk *newdirblk;
2489 struct worklist *wk;
2490
2491 mtx_assert(&lk, MA_OWNED);
2492 if ((adp->ad_state & DEPCOMPLETE) == 0)
2493 LIST_REMOVE(adp, ad_deps);
2494 TAILQ_REMOVE(adphead, adp, ad_next);
2495 if ((adp->ad_state & COMPLETE) == 0)
2496 WORKLIST_REMOVE(&adp->ad_list);
2497 if (adp->ad_freefrag != NULL) {
2498 if (delay)
2499 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2500 &adp->ad_freefrag->ff_list);
2501 else
2502 add_to_worklist(&adp->ad_freefrag->ff_list);
2503 }
2504 if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2505 newdirblk = WK_NEWDIRBLK(wk);
2506 WORKLIST_REMOVE(&newdirblk->db_list);
2507 if (!LIST_EMPTY(&adp->ad_newdirblk))
2508 panic("free_allocdirect: extra newdirblk");
2509 if (delay)
2510 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2511 &newdirblk->db_list);
2512 else
2513 free_newdirblk(newdirblk);
2514 }
2515 WORKITEM_FREE(adp, D_ALLOCDIRECT);
2516 }
2517
2518 /*
2519 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2520 * This routine must be called with splbio interrupts blocked.
2521 */
2522 static void
2523 free_newdirblk(newdirblk)
2524 struct newdirblk *newdirblk;
2525 {
2526 struct pagedep *pagedep;
2527 struct diradd *dap;
2528 int i;
2529
2530 mtx_assert(&lk, MA_OWNED);
2531 /*
2532 * If the pagedep is still linked onto the directory buffer
2533 * dependency chain, then some of the entries on the
2534 * pd_pendinghd list may not be committed to disk yet. In
2535 * this case, we will simply clear the NEWBLOCK flag and
2536 * let the pd_pendinghd list be processed when the pagedep
2537 * is next written. If the pagedep is no longer on the buffer
2538 * dependency chain, then all the entries on the pd_pendinghd
2539 * list are committed to disk and we can free them here.
2540 */
2541 pagedep = newdirblk->db_pagedep;
2542 pagedep->pd_state &= ~NEWBLOCK;
2543 if ((pagedep->pd_state & ONWORKLIST) == 0)
2544 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2545 free_diradd(dap);
2546 /*
2547 * If no dependencies remain, the pagedep will be freed.
2548 */
2549 for (i = 0; i < DAHASHSZ; i++)
2550 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
2551 break;
2552 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2553 LIST_REMOVE(pagedep, pd_hash);
2554 WORKITEM_FREE(pagedep, D_PAGEDEP);
2555 }
2556 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2557 }
2558
2559 /*
2560 * Prepare an inode to be freed. The actual free operation is not
2561 * done until the zero'ed inode has been written to disk.
2562 */
2563 void
2564 softdep_freefile(pvp, ino, mode)
2565 struct vnode *pvp;
2566 ino_t ino;
2567 int mode;
2568 {
2569 struct inode *ip = VTOI(pvp);
2570 struct inodedep *inodedep;
2571 struct freefile *freefile;
2572
2573 /*
2574 * This sets up the inode de-allocation dependency.
2575 */
2576 MALLOC(freefile, struct freefile *, sizeof(struct freefile),
2577 M_FREEFILE, M_SOFTDEP_FLAGS);
2578 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
2579 freefile->fx_mode = mode;
2580 freefile->fx_oldinum = ino;
2581 freefile->fx_devvp = ip->i_devvp;
2582 if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
2583 UFS_LOCK(ip->i_ump);
2584 ip->i_fs->fs_pendinginodes += 1;
2585 UFS_UNLOCK(ip->i_ump);
2586 }
2587
2588 /*
2589 * If the inodedep does not exist, then the zero'ed inode has
2590 * been written to disk. If the allocated inode has never been
2591 * written to disk, then the on-disk inode is zero'ed. In either
2592 * case we can free the file immediately.
2593 */
2594 ACQUIRE_LOCK(&lk);
2595 if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
2596 check_inode_unwritten(inodedep)) {
2597 FREE_LOCK(&lk);
2598 handle_workitem_freefile(freefile);
2599 return;
2600 }
2601 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2602 FREE_LOCK(&lk);
2603 ip->i_flag |= IN_MODIFIED;
2604 }
2605
2606 /*
2607 * Check to see if an inode has never been written to disk. If
2608 * so free the inodedep and return success, otherwise return failure.
2609 * This routine must be called with splbio interrupts blocked.
2610 *
2611 * If we still have a bitmap dependency, then the inode has never
2612 * been written to disk. Drop the dependency as it is no longer
2613 * necessary since the inode is being deallocated. We set the
2614 * ALLCOMPLETE flags since the bitmap now properly shows that the
2615 * inode is not allocated. Even if the inode is actively being
2616 * written, it has been rolled back to its zero'ed state, so we
2617 * are assured that a zero inode is what is on the disk. For short
2618 * lived files, this change will usually result in removing all the
2619 * dependencies from the inode so that it can be freed immediately.
2620 */
2621 static int
2622 check_inode_unwritten(inodedep)
2623 struct inodedep *inodedep;
2624 {
2625
2626 mtx_assert(&lk, MA_OWNED);
2627 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2628 !LIST_EMPTY(&inodedep->id_pendinghd) ||
2629 !LIST_EMPTY(&inodedep->id_bufwait) ||
2630 !LIST_EMPTY(&inodedep->id_inowait) ||
2631 !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2632 !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2633 !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2634 !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2635 inodedep->id_nlinkdelta != 0)
2636 return (0);
2637
2638 /*
2639 * Another process might be in initiate_write_inodeblock_ufs[12]
2640 * trying to allocate memory without holding "Softdep Lock".
2641 */
2642 if ((inodedep->id_state & IOSTARTED) != 0 &&
2643 inodedep->id_savedino1 == NULL)
2644 return (0);
2645
2646 inodedep->id_state |= ALLCOMPLETE;
2647 LIST_REMOVE(inodedep, id_deps);
2648 inodedep->id_buf = NULL;
2649 if (inodedep->id_state & ONWORKLIST)
2650 WORKLIST_REMOVE(&inodedep->id_list);
2651 if (inodedep->id_savedino1 != NULL) {
2652 FREE(inodedep->id_savedino1, M_SAVEDINO);
2653 inodedep->id_savedino1 = NULL;
2654 }
2655 if (free_inodedep(inodedep) == 0)
2656 panic("check_inode_unwritten: busy inode");
2657 return (1);
2658 }
2659
2660 /*
2661 * Try to free an inodedep structure. Return 1 if it could be freed.
2662 */
2663 static int
2664 free_inodedep(inodedep)
2665 struct inodedep *inodedep;
2666 {
2667
2668 mtx_assert(&lk, MA_OWNED);
2669 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2670 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2671 !LIST_EMPTY(&inodedep->id_pendinghd) ||
2672 !LIST_EMPTY(&inodedep->id_bufwait) ||
2673 !LIST_EMPTY(&inodedep->id_inowait) ||
2674 !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
2675 !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
2676 !TAILQ_EMPTY(&inodedep->id_extupdt) ||
2677 !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
2678 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2679 return (0);
2680 LIST_REMOVE(inodedep, id_hash);
2681 WORKITEM_FREE(inodedep, D_INODEDEP);
2682 num_inodedep -= 1;
2683 return (1);
2684 }
2685
2686 /*
2687 * This workitem routine performs the block de-allocation.
2688 * The workitem is added to the pending list after the updated
2689 * inode block has been written to disk. As mentioned above,
2690 * checks regarding the number of blocks de-allocated (compared
2691 * to the number of blocks allocated for the file) are also
2692 * performed in this function.
2693 */
2694 static void
2695 handle_workitem_freeblocks(freeblks, flags)
2696 struct freeblks *freeblks;
2697 int flags;
2698 {
2699 struct inode *ip;
2700 struct vnode *vp;
2701 struct fs *fs;
2702 struct ufsmount *ump;
2703 int i, nblocks, level, bsize;
2704 ufs2_daddr_t bn, blocksreleased = 0;
2705 int error, allerror = 0;
2706 ufs_lbn_t baselbns[NIADDR], tmpval;
2707 int fs_pendingblocks;
2708
2709 ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2710 fs = ump->um_fs;
2711 fs_pendingblocks = 0;
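	/*
	 * baselbns[i] is the first logical block number mapped through the
	 * level-i indirect block: NDADDR for the single indirect,
	 * NDADDR + NINDIR(fs) for the double indirect, and so on. Each is
	 * handed to indir_trunc as the starting lbn of that subtree.
	 */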
2712 tmpval = 1;
2713 baselbns[0] = NDADDR;
2714 for (i = 1; i < NIADDR; i++) {
2715 tmpval *= NINDIR(fs);
2716 baselbns[i] = baselbns[i - 1] + tmpval;
2717 }
2718 nblocks = btodb(fs->fs_bsize);
2719 blocksreleased = 0;
2720 /*
2721 * Release all extended attribute blocks or frags.
2722 */
2723 if (freeblks->fb_oldextsize > 0) {
2724 for (i = (NXADDR - 1); i >= 0; i--) {
2725 if ((bn = freeblks->fb_eblks[i]) == 0)
2726 continue;
2727 bsize = sblksize(fs, freeblks->fb_oldextsize, i);
2728 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2729 freeblks->fb_previousinum);
2730 blocksreleased += btodb(bsize);
2731 }
2732 }
2733 /*
2734 * Release all data blocks or frags.
2735 */
2736 if (freeblks->fb_oldsize > 0) {
2737 /*
2738 * Indirect blocks first.
2739 */
2740 for (level = (NIADDR - 1); level >= 0; level--) {
2741 if ((bn = freeblks->fb_iblks[level]) == 0)
2742 continue;
2743 if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
2744 level, baselbns[level], &blocksreleased)) != 0)
2745 allerror = error;
2746 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
2747 fs->fs_bsize, freeblks->fb_previousinum);
2748 fs_pendingblocks += nblocks;
2749 blocksreleased += nblocks;
2750 }
2751 /*
2752 * All direct blocks or frags.
2753 */
2754 for (i = (NDADDR - 1); i >= 0; i--) {
2755 if ((bn = freeblks->fb_dblks[i]) == 0)
2756 continue;
2757 bsize = sblksize(fs, freeblks->fb_oldsize, i);
2758 ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
2759 freeblks->fb_previousinum);
2760 fs_pendingblocks += btodb(bsize);
2761 blocksreleased += btodb(bsize);
2762 }
2763 }
2764 UFS_LOCK(ump);
2765 fs->fs_pendingblocks -= fs_pendingblocks;
2766 UFS_UNLOCK(ump);
2767 /*
2768 * If we still have not finished background cleanup, then check
2769 * to see if the block count needs to be adjusted.
2770 */
2771 if (freeblks->fb_chkcnt != blocksreleased &&
2772 (fs->fs_flags & FS_UNCLEAN) != 0 &&
2773 ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
2774 (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
2775 ip = VTOI(vp);
2776 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) +
2777 freeblks->fb_chkcnt - blocksreleased);
2778 ip->i_flag |= IN_CHANGE;
2779 vput(vp);
2780 }
2781
2782 #ifdef INVARIANTS
2783 if (freeblks->fb_chkcnt != blocksreleased &&
2784 ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
2785 printf("handle_workitem_freeblocks: block count\n");
2786 if (allerror)
2787 softdep_error("handle_workitem_freeblks", allerror);
2788 #endif /* INVARIANTS */
2789
2790 ACQUIRE_LOCK(&lk);
2791 WORKITEM_FREE(freeblks, D_FREEBLKS);
2792 num_freeblkdep--;
2793 FREE_LOCK(&lk);
2794 }
2795
2796 /*
2797 * Release blocks associated with the inode ip and stored in the indirect
2798 * block dbn. If level is greater than SINGLE, the block is an indirect block
2799 * and recursive calls to indir_trunc must be used to cleanse other indirect
2800 * blocks.
2801 */
2802 static int
2803 indir_trunc(freeblks, dbn, level, lbn, countp)
2804 struct freeblks *freeblks;
2805 ufs2_daddr_t dbn;
2806 int level;
2807 ufs_lbn_t lbn;
2808 ufs2_daddr_t *countp;
2809 {
2810 struct buf *bp;
2811 struct fs *fs;
2812 struct worklist *wk;
2813 struct indirdep *indirdep;
2814 struct ufsmount *ump;
2815 ufs1_daddr_t *bap1 = 0;
2816 ufs2_daddr_t nb, *bap2 = 0;
2817 ufs_lbn_t lbnadd;
2818 int i, nblocks, ufs1fmt;
2819 int error, allerror = 0;
2820 int fs_pendingblocks;
2821
2822 ump = VFSTOUFS(freeblks->fb_list.wk_mp);
2823 fs = ump->um_fs;
2824 fs_pendingblocks = 0;
2825 lbnadd = 1;
2826 for (i = level; i > 0; i--)
2827 lbnadd *= NINDIR(fs);
2828 /*
2829 * Get buffer of block pointers to be freed. This routine is not
2830 * called until the zero'ed inode has been written, so it is safe
2831 * to free blocks as they are encountered. Because the inode has
2832 * been zero'ed, calls to bmap on these blocks will fail. So, we
2833 * have to use the on-disk address and the block device for the
2834 * filesystem to look them up. If the file was deleted before its
2835 * indirect blocks were all written to disk, the routine that set
2836 * us up (deallocate_dependencies) will have arranged to leave
2837 * a complete copy of the indirect block in memory for our use.
2838 * Otherwise we have to read the blocks in from the disk.
2839 */
2840 #ifdef notyet
2841 bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
2842 GB_NOCREAT);
2843 #else
2844 bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
2845 #endif
2846 ACQUIRE_LOCK(&lk);
2847 if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2848 if (wk->wk_type != D_INDIRDEP ||
2849 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2850 (indirdep->ir_state & GOINGAWAY) == 0)
2851 panic("indir_trunc: lost indirdep");
2852 WORKLIST_REMOVE(wk);
2853 WORKITEM_FREE(indirdep, D_INDIRDEP);
2854 if (!LIST_EMPTY(&bp->b_dep))
2855 panic("indir_trunc: dangling dep");
2856 ump->um_numindirdeps -= 1;
2857 FREE_LOCK(&lk);
2858 } else {
2859 #ifdef notyet
2860 if (bp)
2861 brelse(bp);
2862 #endif
2863 FREE_LOCK(&lk);
2864 error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
2865 NOCRED, &bp);
2866 if (error) {
2867 brelse(bp);
2868 return (error);
2869 }
2870 }
2871 /*
2872 * Recursively free indirect blocks.
2873 */
2874 if (ump->um_fstype == UFS1) {
2875 ufs1fmt = 1;
2876 bap1 = (ufs1_daddr_t *)bp->b_data;
2877 } else {
2878 ufs1fmt = 0;
2879 bap2 = (ufs2_daddr_t *)bp->b_data;
2880 }
2881 nblocks = btodb(fs->fs_bsize);
2882 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2883 if (ufs1fmt)
2884 nb = bap1[i];
2885 else
2886 nb = bap2[i];
2887 if (nb == 0)
2888 continue;
2889 if (level != 0) {
2890 if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
2891 level - 1, lbn + (i * lbnadd), countp)) != 0)
2892 allerror = error;
2893 }
2894 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
2895 freeblks->fb_previousinum);
2896 fs_pendingblocks += nblocks;
2897 *countp += nblocks;
2898 }
2899 UFS_LOCK(ump);
2900 fs->fs_pendingblocks -= fs_pendingblocks;
2901 UFS_UNLOCK(ump);
2902 bp->b_flags |= B_INVAL | B_NOCACHE;
2903 brelse(bp);
2904 return (allerror);
2905 }
2906
2907 /*
2908 * Free an allocindir.
2909 * This routine must be called with splbio interrupts blocked.
2910 */
2911 static void
2912 free_allocindir(aip, inodedep)
2913 struct allocindir *aip;
2914 struct inodedep *inodedep;
2915 {
2916 struct freefrag *freefrag;
2917
2918 mtx_assert(&lk, MA_OWNED);
2919 if ((aip->ai_state & DEPCOMPLETE) == 0)
2920 LIST_REMOVE(aip, ai_deps);
2921 if (aip->ai_state & ONWORKLIST)
2922 WORKLIST_REMOVE(&aip->ai_list);
2923 LIST_REMOVE(aip, ai_next);
2924 if ((freefrag = aip->ai_freefrag) != NULL) {
2925 if (inodedep == NULL)
2926 add_to_worklist(&freefrag->ff_list);
2927 else
2928 WORKLIST_INSERT(&inodedep->id_bufwait,
2929 &freefrag->ff_list);
2930 }
2931 WORKITEM_FREE(aip, D_ALLOCINDIR);
2932 }
2933
2934 /*
2935 * Directory entry addition dependencies.
2936 *
2937 * When adding a new directory entry, the inode (with its incremented link
2938 * count) must be written to disk before the directory entry's pointer to it.
2939 * Also, if the inode is newly allocated, the corresponding freemap must be
2940 * updated (on disk) before the directory entry's pointer. These requirements
2941 * are met via undo/redo on the directory entry's pointer, which consists
2942 * simply of the inode number.
2943 *
2944 * As directory entries are added and deleted, the free space within a
2945 * directory block can become fragmented. The ufs filesystem will compact
2946 * a fragmented directory block to make space for a new entry. When this
2947 * occurs, the offsets of previously added entries change. Any "diradd"
2948 * dependency structures corresponding to these entries must be updated with
2949 * the new offsets.
2950 */
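/*
 * As an illustrative sketch (not compiled), the undo/redo applied to a new
 * entry whose dependencies are not yet satisfied looks roughly like this,
 * where "ep" is a hypothetical pointer to the struct direct image being
 * written:
 *
 *	ep->d_ino = 0;			undo: entry appears unused on disk
 *	...				the new inode and freemap reach the disk
 *	ep->d_ino = dap->da_newinum;	redo once the pointer is safe
 *
 * The rollback and roll-forward are performed by the directory page write
 * initiation and completion handlers later in this file.
 */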
2951
2952 /*
2953 * This routine is called after the in-memory inode's link
2954 * count has been incremented, but before the directory entry's
2955 * pointer to the inode has been set.
2956 */
2957 int
2958 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2959 struct buf *bp; /* buffer containing directory block */
2960 struct inode *dp; /* inode for directory */
2961 off_t diroffset; /* offset of new entry in directory */
2962 ino_t newinum; /* inode referenced by new directory entry */
2963 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2964 int isnewblk; /* entry is in a newly allocated block */
2965 {
2966 int offset; /* offset of new entry within directory block */
2967 ufs_lbn_t lbn; /* block in directory containing new entry */
2968 struct fs *fs;
2969 struct diradd *dap;
2970 struct allocdirect *adp;
2971 struct pagedep *pagedep;
2972 struct inodedep *inodedep;
2973 struct newdirblk *newdirblk = 0;
2974 struct mkdir *mkdir1, *mkdir2;
2975 struct mount *mp;
2976
2977 /*
2978 * Whiteouts have no dependencies.
2979 */
2980 if (newinum == WINO) {
2981 if (newdirbp != NULL)
2982 bdwrite(newdirbp);
2983 return (0);
2984 }
2985 mp = UFSTOVFS(dp->i_ump);
2986 fs = dp->i_fs;
2987 lbn = lblkno(fs, diroffset);
2988 offset = blkoff(fs, diroffset);
2989 MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
2990 M_SOFTDEP_FLAGS|M_ZERO);
2991 workitem_alloc(&dap->da_list, D_DIRADD, mp);
2992 dap->da_offset = offset;
2993 dap->da_newinum = newinum;
2994 dap->da_state = ATTACHED;
2995 if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2996 MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
2997 M_NEWDIRBLK, M_SOFTDEP_FLAGS);
2998 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
2999 }
3000 if (newdirbp == NULL) {
3001 dap->da_state |= DEPCOMPLETE;
3002 ACQUIRE_LOCK(&lk);
3003 } else {
3004 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
3005 MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3006 M_SOFTDEP_FLAGS);
3007 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
3008 mkdir1->md_state = MKDIR_BODY;
3009 mkdir1->md_diradd = dap;
3010 MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
3011 M_SOFTDEP_FLAGS);
3012 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
3013 mkdir2->md_state = MKDIR_PARENT;
3014 mkdir2->md_diradd = dap;
3015 /*
3016 * Dependency on "." and ".." being written to disk.
3017 */
3018 mkdir1->md_buf = newdirbp;
3019 ACQUIRE_LOCK(&lk);
3020 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
3021 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
3022 FREE_LOCK(&lk);
3023 bdwrite(newdirbp);
3024 /*
3025 * Dependency on link count increase for parent directory
3026 */
3027 ACQUIRE_LOCK(&lk);
3028 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
3029 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3030 dap->da_state &= ~MKDIR_PARENT;
3031 WORKITEM_FREE(mkdir2, D_MKDIR);
3032 } else {
3033 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
3034 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
3035 }
3036 }
3037 /*
3038 * Link into parent directory pagedep to await its being written.
3039 */
3040 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3041 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3042 dap->da_pagedep = pagedep;
3043 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
3044 da_pdlist);
3045 /*
3046 * Link into its inodedep. Put it on the id_bufwait list if the inode
3047 * is not yet written. If it is written, do the post-inode write
3048 * processing to put it on the id_pendinghd list.
3049 */
3050 (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
3051 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
3052 diradd_inode_written(dap, inodedep);
3053 else
3054 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3055 if (isnewblk) {
3056 /*
3057 * Directories growing into indirect blocks are rare
3058 * enough, and new block allocation in those cases is
3059 * rarer still, so we choose not to bother tracking
3060 * them. Rather we simply force the
3061 * new directory entry to disk.
3062 */
3063 if (lbn >= NDADDR) {
3064 FREE_LOCK(&lk);
3065 /*
3066 * We only have a new allocation when at the
3067 * beginning of a new block, not when we are
3068 * expanding into an existing block.
3069 */
3070 if (blkoff(fs, diroffset) == 0)
3071 return (1);
3072 return (0);
3073 }
3074 /*
3075 * We only have a new allocation when at the beginning
3076 * of a new fragment, not when we are expanding into an
3077 * existing fragment. Also, there is nothing to do if we
3078 * are already tracking this block.
3079 */
3080 if (fragoff(fs, diroffset) != 0) {
3081 FREE_LOCK(&lk);
3082 return (0);
3083 }
3084 if ((pagedep->pd_state & NEWBLOCK) != 0) {
3085 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
3086 FREE_LOCK(&lk);
3087 return (0);
3088 }
3089 /*
3090 * Find our associated allocdirect and have it track us.
3091 */
3092 if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
3093 panic("softdep_setup_directory_add: lost inodedep");
3094 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
3095 if (adp == NULL || adp->ad_lbn != lbn)
3096 panic("softdep_setup_directory_add: lost entry");
3097 pagedep->pd_state |= NEWBLOCK;
3098 newdirblk->db_pagedep = pagedep;
3099 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
3100 }
3101 FREE_LOCK(&lk);
3102 return (0);
3103 }
3104
3105 /*
3106 * This procedure is called to change the offset of a directory
3107 * entry when compacting a directory block, which must be owned
3108 * exclusively by the caller. Note that the actual entry movement
3109 * must be done in this procedure to ensure that no I/O completions
3110 * occur while the move is in progress.
3111 */
3112 void
3113 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
3114 struct inode *dp; /* inode for directory */
3115 caddr_t base; /* address of dp->i_offset */
3116 caddr_t oldloc; /* address of old directory location */
3117 caddr_t newloc; /* address of new directory location */
3118 int entrysize; /* size of directory entry */
3119 {
3120 int offset, oldoffset, newoffset;
3121 struct pagedep *pagedep;
3122 struct diradd *dap;
3123 ufs_lbn_t lbn;
3124
3125 ACQUIRE_LOCK(&lk);
3126 lbn = lblkno(dp->i_fs, dp->i_offset);
3127 offset = blkoff(dp->i_fs, dp->i_offset);
3128 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
3129 goto done;
3130 oldoffset = offset + (oldloc - base);
3131 newoffset = offset + (newloc - base);
3132
3133 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
3134 if (dap->da_offset != oldoffset)
3135 continue;
3136 dap->da_offset = newoffset;
3137 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
3138 break;
3139 LIST_REMOVE(dap, da_pdlist);
3140 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
3141 dap, da_pdlist);
3142 break;
3143 }
3144 if (dap == NULL) {
3145
3146 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
3147 if (dap->da_offset == oldoffset) {
3148 dap->da_offset = newoffset;
3149 break;
3150 }
3151 }
3152 }
3153 done:
3154 bcopy(oldloc, newloc, entrysize);
3155 FREE_LOCK(&lk);
3156 }
3157
3158 /*
3159 * Free a diradd dependency structure. This routine must be called
3160 * with splbio interrupts blocked.
3161 */
3162 static void
3163 free_diradd(dap)
3164 struct diradd *dap;
3165 {
3166 struct dirrem *dirrem;
3167 struct pagedep *pagedep;
3168 struct inodedep *inodedep;
3169 struct mkdir *mkdir, *nextmd;
3170
3171 mtx_assert(&lk, MA_OWNED);
3172 WORKLIST_REMOVE(&dap->da_list);
3173 LIST_REMOVE(dap, da_pdlist);
3174 if ((dap->da_state & DIRCHG) == 0) {
3175 pagedep = dap->da_pagedep;
3176 } else {
3177 dirrem = dap->da_previous;
3178 pagedep = dirrem->dm_pagedep;
3179 dirrem->dm_dirinum = pagedep->pd_ino;
3180 add_to_worklist(&dirrem->dm_list);
3181 }
3182 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
3183 0, &inodedep) != 0)
3184 (void) free_inodedep(inodedep);
3185 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
3186 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
3187 nextmd = LIST_NEXT(mkdir, md_mkdirs);
3188 if (mkdir->md_diradd != dap)
3189 continue;
3190 dap->da_state &= ~mkdir->md_state;
3191 WORKLIST_REMOVE(&mkdir->md_list);
3192 LIST_REMOVE(mkdir, md_mkdirs);
3193 WORKITEM_FREE(mkdir, D_MKDIR);
3194 }
3195 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
3196 panic("free_diradd: unfound ref");
3197 }
3198 WORKITEM_FREE(dap, D_DIRADD);
3199 }
3200
3201 /*
3202 * Directory entry removal dependencies.
3203 *
3204 * When removing a directory entry, the entry's inode pointer must be
3205 * zero'ed on disk before the corresponding inode's link count is decremented
3206 * (possibly freeing the inode for re-use). This dependency is handled by
3207 * updating the directory entry but delaying the inode count reduction until
3208 * after the directory block has been written to disk. After this point, the
3209 * inode count can be decremented whenever it is convenient.
3210 */
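/*
 * As an illustrative sketch (not compiled), removing an entry results in
 * roughly the following ordering, where "ep" is a hypothetical pointer to
 * the on-disk image of the entry:
 *
 *	ep->d_ino = 0;			entry cleared in the directory block
 *	... the directory block reaches stable storage ...
 *	handle_workitem_remove(dirrem, NULL);
 *		(the inode's link count is finally decremented and, if it
 *		 drops to zero, the inode itself is freed)
 *
 * The dirrem work item carrying the deferred decrement is created by the
 * routines below.
 */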
3211
3212 /*
3213 * This routine should be called immediately after removing
3214 * a directory entry. The inode's link count should not be
3215 * decremented by the calling procedure -- the soft updates
3216 * code will do this task when it is safe.
3217 */
3218 void
3219 softdep_setup_remove(bp, dp, ip, isrmdir)
3220 struct buf *bp; /* buffer containing directory block */
3221 struct inode *dp; /* inode for the directory being modified */
3222 struct inode *ip; /* inode for directory entry being removed */
3223 int isrmdir; /* indicates if doing RMDIR */
3224 {
3225 struct dirrem *dirrem, *prevdirrem;
3226
3227 /*
3228 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
3229 */
3230 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3231
3232 /*
3233 * If the COMPLETE flag is clear, then there were no active
3234 * entries and we want to roll back to a zeroed entry until
3235 * the new inode is committed to disk. If the COMPLETE flag is
3236 * set then we have deleted an entry that never made it to
3237 * disk. If the entry we deleted resulted from a name change,
3238 * then the old name still resides on disk. We cannot delete
3239 * its inode (returned to us in prevdirrem) until the zeroed
3240 * directory entry gets to disk. The new inode has never been
3241 * referenced on the disk, so can be deleted immediately.
3242 */
3243 if ((dirrem->dm_state & COMPLETE) == 0) {
3244 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
3245 dm_next);
3246 FREE_LOCK(&lk);
3247 } else {
3248 if (prevdirrem != NULL)
3249 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
3250 prevdirrem, dm_next);
3251 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
3252 FREE_LOCK(&lk);
3253 handle_workitem_remove(dirrem, NULL);
3254 }
3255 }
3256
3257 /*
3258 * Allocate a new dirrem if appropriate and return it along with
3259 * its associated pagedep. Called without a lock, returns with lock.
3260 */
3261 static long num_dirrem; /* number of dirrem allocated */
3262 static struct dirrem *
3263 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
3264 struct buf *bp; /* buffer containing directory block */
3265 struct inode *dp; /* inode for the directory being modified */
3266 struct inode *ip; /* inode for directory entry being removed */
3267 int isrmdir; /* indicates if doing RMDIR */
3268 struct dirrem **prevdirremp; /* previously referenced inode, if any */
3269 {
3270 int offset;
3271 ufs_lbn_t lbn;
3272 struct diradd *dap;
3273 struct dirrem *dirrem;
3274 struct pagedep *pagedep;
3275
3276 /*
3277 * Whiteouts have no deletion dependencies.
3278 */
3279 if (ip == NULL)
3280 panic("newdirrem: whiteout");
3281 /*
3282 * If we are over our limit, try to improve the situation.
3283 * Limiting the number of dirrem structures will also limit
3284 * the number of freefile and freeblks structures.
3285 */
3286 ACQUIRE_LOCK(&lk);
3287 if (num_dirrem > max_softdeps / 2)
3288 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
3289 num_dirrem += 1;
3290 FREE_LOCK(&lk);
3291 MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
3292 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
3293 workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
3294 dirrem->dm_state = isrmdir ? RMDIR : 0;
3295 dirrem->dm_oldinum = ip->i_number;
3296 *prevdirremp = NULL;
3297
3298 ACQUIRE_LOCK(&lk);
3299 lbn = lblkno(dp->i_fs, dp->i_offset);
3300 offset = blkoff(dp->i_fs, dp->i_offset);
3301 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
3302 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
3303 dirrem->dm_pagedep = pagedep;
3304 /*
3305 * Check for a diradd dependency for the same directory entry.
3306 * If present, then both dependencies become obsolete and can
3307 * be de-allocated. Check for an entry on both the pd_diraddhd
3308 * list and the pd_pendinghd list.
3309 */
3310
3311 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
3312 if (dap->da_offset == offset)
3313 break;
3314 if (dap == NULL) {
3315
3316 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
3317 if (dap->da_offset == offset)
3318 break;
3319 if (dap == NULL)
3320 return (dirrem);
3321 }
3322 /*
3323 * Must be ATTACHED at this point.
3324 */
3325 if ((dap->da_state & ATTACHED) == 0)
3326 panic("newdirrem: not ATTACHED");
3327 if (dap->da_newinum != ip->i_number)
3328 panic("newdirrem: inum %d should be %d",
3329 ip->i_number, dap->da_newinum);
3330 /*
3331 * If we are deleting a changed name that never made it to disk,
3332 * then return the dirrem describing the previous inode (which
3333 * represents the inode currently referenced from this entry on disk).
3334 */
3335 if ((dap->da_state & DIRCHG) != 0) {
3336 *prevdirremp = dap->da_previous;
3337 dap->da_state &= ~DIRCHG;
3338 dap->da_pagedep = pagedep;
3339 }
3340 /*
3341 * We are deleting an entry that never made it to disk.
3342 * Mark it COMPLETE so we can delete its inode immediately.
3343 */
3344 dirrem->dm_state |= COMPLETE;
3345 free_diradd(dap);
3346 return (dirrem);
3347 }
3348
3349 /*
3350 * Directory entry change dependencies.
3351 *
3352 * Changing an existing directory entry requires that an add operation
3353 * be completed first followed by a deletion. The semantics for the addition
3354 * are identical to the description of adding a new entry above except
3355 * that the rollback is to the old inode number rather than zero. Once
3356 * the addition dependency is completed, the removal is done as described
3357 * in the removal routine above.
3358 */
3359
3360 /*
3361 * This routine should be called immediately after changing
3362 * a directory entry. The inode's link count should not be
3363 * decremented by the calling procedure -- the soft updates
3364 * code will perform this task when it is safe.
3365 */
3366 void
3367 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3368 struct buf *bp; /* buffer containing directory block */
3369 struct inode *dp; /* inode for the directory being modified */
3370 struct inode *ip; /* inode for directory entry being removed */
3371 ino_t newinum; /* new inode number for changed entry */
3372 int isrmdir; /* indicates if doing RMDIR */
3373 {
3374 int offset;
3375 struct diradd *dap = NULL;
3376 struct dirrem *dirrem, *prevdirrem;
3377 struct pagedep *pagedep;
3378 struct inodedep *inodedep;
3379 struct mount *mp;
3380
3381 offset = blkoff(dp->i_fs, dp->i_offset);
3382 mp = UFSTOVFS(dp->i_ump);
3383
3384 /*
3385 * Whiteouts do not need diradd dependencies.
3386 */
3387 if (newinum != WINO) {
3388 MALLOC(dap, struct diradd *, sizeof(struct diradd),
3389 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
3390 workitem_alloc(&dap->da_list, D_DIRADD, mp);
3391 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3392 dap->da_offset = offset;
3393 dap->da_newinum = newinum;
3394 }
3395
3396 /*
3397 * Allocate a new dirrem and ACQUIRE_LOCK.
3398 */
3399 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3400 pagedep = dirrem->dm_pagedep;
3401 /*
3402 * The possible values for isrmdir:
3403 * 0 - non-directory file rename
3404 * 1 - directory rename within same directory
3405 * inum - directory rename to new directory of given inode number
3406 * When renaming to a new directory, we are both deleting and
3407 * creating a new directory entry, so the link count on the new
3408 * directory should not change. Thus we do not need the followup
3409 * dirrem which is usually done in handle_workitem_remove. We set
3410 * the DIRCHG flag to tell handle_workitem_remove to skip the
3411 * followup dirrem.
3412 */
3413 if (isrmdir > 1)
3414 dirrem->dm_state |= DIRCHG;
3415
3416 /*
3417 * Whiteouts have no additional dependencies,
3418 * so just put the dirrem on the correct list.
3419 */
3420 if (newinum == WINO) {
3421 if ((dirrem->dm_state & COMPLETE) == 0) {
3422 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
3423 dm_next);
3424 } else {
3425 dirrem->dm_dirinum = pagedep->pd_ino;
3426 add_to_worklist(&dirrem->dm_list);
3427 }
3428 FREE_LOCK(&lk);
3429 return;
3430 }
3431
3432 /*
3433 * If the COMPLETE flag is clear, then there were no active
3434 * entries and we want to roll back to the previous inode until
3435 * the new inode is committed to disk. If the COMPLETE flag is
3436 * set, then we have deleted an entry that never made it to disk.
3437 * If the entry we deleted resulted from a name change, then the old
3438 * inode reference still resides on disk. Any rollback that we do
3439 * needs to be to that old inode (returned to us in prevdirrem). If
3440 * the entry we deleted resulted from a create, then there is
3441 * no entry on the disk, so we want to roll back to zero rather
3442 * than the uncommitted inode. In either of the COMPLETE cases we
3443 * want to immediately free the unwritten and unreferenced inode.
3444 */
3445 if ((dirrem->dm_state & COMPLETE) == 0) {
3446 dap->da_previous = dirrem;
3447 } else {
3448 if (prevdirrem != NULL) {
3449 dap->da_previous = prevdirrem;
3450 } else {
3451 dap->da_state &= ~DIRCHG;
3452 dap->da_pagedep = pagedep;
3453 }
3454 dirrem->dm_dirinum = pagedep->pd_ino;
3455 add_to_worklist(&dirrem->dm_list);
3456 }
3457 /*
3458 * Link into its inodedep. Put it on the id_bufwait list if the inode
3459 * is not yet written. If it is written, do the post-inode write
3460 * processing to put it on the id_pendinghd list.
3461 */
3462 if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
3463 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3464 dap->da_state |= COMPLETE;
3465 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3466 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3467 } else {
3468 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3469 dap, da_pdlist);
3470 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3471 }
3472 FREE_LOCK(&lk);
3473 }
3474
3475 /*
3476 * Called whenever the link count on an inode is changed.
3477 * It creates an inode dependency so that the new reference(s)
3478 * to the inode cannot be committed to disk until the updated
3479 * inode has been written.
3480 */
3481 void
3482 softdep_change_linkcnt(ip)
3483 struct inode *ip; /* the inode with the increased link count */
3484 {
3485 struct inodedep *inodedep;
3486
3487 ACQUIRE_LOCK(&lk);
3488 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
3489 DEPALLOC, &inodedep);
3490 if (ip->i_nlink < ip->i_effnlink)
3491 panic("softdep_change_linkcnt: bad delta");
3492 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3493 FREE_LOCK(&lk);
3494 }
3495
3496 /*
3497 * Called when the effective link count and the reference count
3498 * on an inode drops to zero. At this point there are no names
3499 * referencing the file in the filesystem and no active file
3500 * references. The space associated with the file will be freed
3501 * as soon as the necessary soft dependencies are cleared.
3502 */
3503 void
3504 softdep_releasefile(ip)
3505 struct inode *ip; /* inode with the zero effective link count */
3506 {
3507 struct inodedep *inodedep;
3508 struct fs *fs;
3509 int extblocks;
3510
3511 if (ip->i_effnlink > 0)
3512 panic("softdep_releasefile: file still referenced");
3513 /*
3514 * We may be called several times as the on-disk link count
3515 * drops to zero. We only want to account for the space once.
3516 */
3517 if (ip->i_flag & IN_SPACECOUNTED)
3518 return;
3519 /*
3520 * We have to deactivate a snapshot, otherwise copy-on-writes may
3521 * add blocks and the cleanup may remove blocks after we have
3522 * tried to account for them.
3523 */
3524 if ((ip->i_flags & SF_SNAPSHOT) != 0)
3525 ffs_snapremove(ITOV(ip));
3526 /*
3527 * If we are tracking an nlinkdelta, we have to also remember
3528 * whether we accounted for the freed space yet.
3529 */
3530 ACQUIRE_LOCK(&lk);
3531 if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
3532 inodedep->id_state |= SPACECOUNTED;
3533 FREE_LOCK(&lk);
3534 fs = ip->i_fs;
3535 extblocks = 0;
3536 if (fs->fs_magic == FS_UFS2_MAGIC)
3537 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
3538 UFS_LOCK(ip->i_ump);
3539 ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
3540 ip->i_fs->fs_pendinginodes += 1;
3541 UFS_UNLOCK(ip->i_ump);
3542 ip->i_flag |= IN_SPACECOUNTED;
3543 }
3544
3545 /*
3546 * This workitem decrements the inode's link count.
3547 * If the link count reaches zero, the file is removed.
3548 */
3549 static void
3550 handle_workitem_remove(dirrem, xp)
3551 struct dirrem *dirrem;
3552 struct vnode *xp;
3553 {
3554 struct thread *td = curthread;
3555 struct inodedep *inodedep;
3556 struct vnode *vp;
3557 struct inode *ip;
3558 ino_t oldinum;
3559 int error;
3560
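/*
 * The caller may hand us an already locked vnode in xp; otherwise
 * fetch and lock the vnode for the inode being removed.
 */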
3561 if ((vp = xp) == NULL &&
3562 (error = ffs_vget(dirrem->dm_list.wk_mp,
3563 dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
3564 softdep_error("handle_workitem_remove: vget", error);
3565 return;
3566 }
3567 ip = VTOI(vp);
3568 ACQUIRE_LOCK(&lk);
3569 if ((inodedep_lookup(dirrem->dm_list.wk_mp,
3570 dirrem->dm_oldinum, 0, &inodedep)) == 0)
3571 panic("handle_workitem_remove: lost inodedep");
3572 /*
3573 * Normal file deletion.
3574 */
3575 if ((dirrem->dm_state & RMDIR) == 0) {
3576 ip->i_nlink--;
3577 DIP_SET(ip, i_nlink, ip->i_nlink);
3578 ip->i_flag |= IN_CHANGE;
3579 if (ip->i_nlink < ip->i_effnlink)
3580 panic("handle_workitem_remove: bad file delta");
3581 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3582 num_dirrem -= 1;
3583 WORKITEM_FREE(dirrem, D_DIRREM);
3584 FREE_LOCK(&lk);
3585 vput(vp);
3586 return;
3587 }
3588 /*
3589 * Directory deletion. Decrement reference count for both the
3590 * just deleted parent directory entry and the reference for ".".
3591 * Next truncate the directory to length zero. When the
3592 * truncation completes, arrange to have the reference count on
3593 * the parent decremented to account for the loss of "..".
3594 */
3595 ip->i_nlink -= 2;
3596 DIP_SET(ip, i_nlink, ip->i_nlink);
3597 ip->i_flag |= IN_CHANGE;
3598 if (ip->i_nlink < ip->i_effnlink)
3599 panic("handle_workitem_remove: bad dir delta");
3600 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
3601 FREE_LOCK(&lk);
3602 if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
3603 softdep_error("handle_workitem_remove: truncate", error);
3604 ACQUIRE_LOCK(&lk);
3605 /*
3606 * Rename a directory to a new parent. Since we are both deleting
3607 * and creating a new directory entry, the link count on the new
3608 * directory should not change. Thus we skip the followup dirrem.
3609 */
3610 if (dirrem->dm_state & DIRCHG) {
3611 num_dirrem -= 1;
3612 WORKITEM_FREE(dirrem, D_DIRREM);
3613 FREE_LOCK(&lk);
3614 vput(vp);
3615 return;
3616 }
3617 /*
3618 * If the inodedep does not exist, then the zero'ed inode has
3619 * been written to disk. If the allocated inode has never been
3620 * written to disk, then the on-disk inode is zero'ed. In either
3621 * case we can remove the file immediately.
3622 */
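/*
 * Reuse the dirrem to decrement the parent directory's link count
 * for the lost ".." reference once the truncated directory inode
 * reaches the disk; dm_oldinum now identifies the parent directory.
 */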
3623 dirrem->dm_state = 0;
3624 oldinum = dirrem->dm_oldinum;
3625 dirrem->dm_oldinum = dirrem->dm_dirinum;
3626 if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
3627 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
3628 if (xp != NULL)
3629 add_to_worklist(&dirrem->dm_list);
3630 FREE_LOCK(&lk);
3631 vput(vp);
3632 if (xp == NULL)
3633 handle_workitem_remove(dirrem, NULL);
3634 return;
3635 }
3636 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3637 FREE_LOCK(&lk);
3638 ip->i_flag |= IN_CHANGE;
3639 ffs_update(vp, 0);
3640 vput(vp);
3641 }
3642
3643 /*
3644 * Inode de-allocation dependencies.
3645 *
3646 * When an inode's link count is reduced to zero, it can be de-allocated. We
3647 * found it convenient to postpone de-allocation until after the inode is
3648 * written to disk with its new link count (zero). At this point, all of the
3649 * on-disk inode's block pointers are nullified and, with careful dependency
3650 * list ordering, all dependencies related to the inode will be satisfied and
3651 * the corresponding dependency structures de-allocated. So, if/when the
3652 * inode is reused, there will be no mixing of old dependencies with new
3653 * ones. This artificial dependency is set up by the block de-allocation
3654 * procedure above (softdep_setup_freeblocks) and completed by the
3655 * following procedure.
3656 */
3657 static void
3658 handle_workitem_freefile(freefile)
3659 struct freefile *freefile;
3660 {
3661 struct fs *fs;
3662 struct inodedep *idp;
3663 struct ufsmount *ump;
3664 int error;
3665
3666 ump = VFSTOUFS(freefile->fx_list.wk_mp);
3667 fs = ump->um_fs;
3668 #ifdef DEBUG
3669 ACQUIRE_LOCK(&lk);
3670 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
3671 FREE_LOCK(&lk);
3672 if (error)
3673 panic("handle_workitem_freefile: inodedep survived");
3674 #endif
3675 UFS_LOCK(ump);
3676 fs->fs_pendinginodes -= 1;
3677 UFS_UNLOCK(ump);
3678 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
3679 freefile->fx_oldinum, freefile->fx_mode)) != 0)
3680 softdep_error("handle_workitem_freefile", error);
3681 ACQUIRE_LOCK(&lk);
3682 WORKITEM_FREE(freefile, D_FREEFILE);
3683 FREE_LOCK(&lk);
3684 }
3685
3686
3687 /*
3688 * Helper function that unlinks the marker element from the work list
3689 * and returns the next element on the list.
3690 */
3691 static __inline struct worklist *
3692 markernext(struct worklist *marker)
3693 {
3694 struct worklist *next;
3695
3696 next = LIST_NEXT(marker, wk_list);
3697 LIST_REMOVE(marker, wk_list);
3698 return next;
3699 }
3700
3701 /*
3702 * Disk writes.
3703 *
3704 * The dependency structures constructed above are most actively used when file
3705 * system blocks are written to disk. No constraints are placed on when a
3706 * block can be written, but unsatisfied update dependencies are made safe by
3707 * modifying (or replacing) the source memory for the duration of the disk
3708 * write. When the disk write completes, the memory block is again brought
3709 * up-to-date.
3710 *
3711 * In-core inode structure reclamation.
3712 *
3713 * Because there are a finite number of "in-core" inode structures, they are
3714 * reused regularly. By transferring all inode-related dependencies to the
3715 * in-memory inode block and indexing them separately (via "inodedep"s), we
3716 * can allow "in-core" inode structures to be reused at any time and avoid
3717 * any increase in contention.
3718 *
3719 * Called just before entering the device driver to initiate a new disk I/O.
3720 * The buffer must be locked, thus, no I/O completion operations can occur
3721 * while we are manipulating its associated dependencies.
3722 */
3723 static void
3724 softdep_disk_io_initiation(bp)
3725 struct buf *bp; /* structure describing disk write to occur */
3726 {
3727 struct worklist *wk;
3728 struct worklist marker;
3729 struct indirdep *indirdep;
3730 struct inodedep *inodedep;
3731
3732 /*
3733 * We only care about write operations. There should never
3734 * be dependencies for reads.
3735 */
3736 if (bp->b_iocmd != BIO_WRITE)
3737 panic("softdep_disk_io_initiation: not write");
3738
3739 marker.wk_type = D_LAST + 1; /* Not a normal workitem */
3740 PHOLD(curproc); /* Don't swap out kernel stack */
3741
3742 ACQUIRE_LOCK(&lk);
3743 /*
3744 * Do any necessary pre-I/O processing.
3745 */
3746 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
3747 wk = markernext(&marker)) {
3748 LIST_INSERT_AFTER(wk, &marker, wk_list);
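/*
 * The marker is linked in after the current item so that the scan
 * can continue safely even if we drop and reacquire the lock (and
 * the list changes) while processing this dependency.
 */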
3749 switch (wk->wk_type) {
3750
3751 case D_PAGEDEP:
3752 initiate_write_filepage(WK_PAGEDEP(wk), bp);
3753 continue;
3754
3755 case D_INODEDEP:
3756 inodedep = WK_INODEDEP(wk);
3757 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3758 initiate_write_inodeblock_ufs1(inodedep, bp);
3759 else
3760 initiate_write_inodeblock_ufs2(inodedep, bp);
3761 continue;
3762
3763 case D_INDIRDEP:
3764 indirdep = WK_INDIRDEP(wk);
3765 if (indirdep->ir_state & GOINGAWAY)
3766 panic("disk_io_initiation: indirdep gone");
3767 /*
3768 * If there are no remaining dependencies, this
3769 * will be writing the real pointers, so the
3770 * dependency can be freed.
3771 */
3772 if (LIST_EMPTY(&indirdep->ir_deplisthd)) {
3773 struct buf *bp;
3774
3775 bp = indirdep->ir_savebp;
3776 bp->b_flags |= B_INVAL | B_NOCACHE;
3777 /* inline expand WORKLIST_REMOVE(wk); */
3778 wk->wk_state &= ~ONWORKLIST;
3779 LIST_REMOVE(wk, wk_list);
3780 WORKITEM_FREE(indirdep, D_INDIRDEP);
3781 FREE_LOCK(&lk);
3782 brelse(bp);
3783 ACQUIRE_LOCK(&lk);
3784 continue;
3785 }
3786 /*
3787 * Replace up-to-date version with safe version.
3788 */
3789 FREE_LOCK(&lk);
3790 MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
3791 M_INDIRDEP, M_SOFTDEP_FLAGS);
3792 ACQUIRE_LOCK(&lk);
3793 indirdep->ir_state &= ~ATTACHED;
3794 indirdep->ir_state |= UNDONE;
3795 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3796 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3797 bp->b_bcount);
3798 continue;
3799
3800 case D_MKDIR:
3801 case D_BMSAFEMAP:
3802 case D_ALLOCDIRECT:
3803 case D_ALLOCINDIR:
3804 continue;
3805
3806 default:
3807 panic("handle_disk_io_initiation: Unexpected type %s",
3808 TYPENAME(wk->wk_type));
3809 /* NOTREACHED */
3810 }
3811 }
3812 FREE_LOCK(&lk);
3813 PRELE(curproc); /* Allow swapout of kernel stack */
3814 }
3815
3816 /*
3817 * Called from within the procedure above to deal with unsatisfied
3818 * allocation dependencies in a directory. The buffer must be locked,
3819 * thus, no I/O completion operations can occur while we are
3820 * manipulating its associated dependencies.
3821 */
3822 static void
3823 initiate_write_filepage(pagedep, bp)
3824 struct pagedep *pagedep;
3825 struct buf *bp;
3826 {
3827 struct diradd *dap;
3828 struct direct *ep;
3829 int i;
3830
3831 if (pagedep->pd_state & IOSTARTED) {
3832 /*
3833 * This can only happen if there is a driver that does not
3834 * understand chaining. Here biodone will reissue the call
3835 * to strategy for the incomplete buffers.
3836 */
3837 printf("initiate_write_filepage: already started\n");
3838 return;
3839 }
3840 pagedep->pd_state |= IOSTARTED;
3841 for (i = 0; i < DAHASHSZ; i++) {
3842 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3843 ep = (struct direct *)
3844 ((char *)bp->b_data + dap->da_offset);
3845 if (ep->d_ino != dap->da_newinum)
3846 panic("%s: dir inum %d != new %d",
3847 "initiate_write_filepage",
3848 ep->d_ino, dap->da_newinum);
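/*
 * Roll the entry back for the disk write: to the previous on-disk
 * inode number if this entry replaced an existing name (DIRCHG),
 * or to zero if the entry has never existed on disk.
 */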
3849 if (dap->da_state & DIRCHG)
3850 ep->d_ino = dap->da_previous->dm_oldinum;
3851 else
3852 ep->d_ino = 0;
3853 dap->da_state &= ~ATTACHED;
3854 dap->da_state |= UNDONE;
3855 }
3856 }
3857 }
3858
3859 /*
3860 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
3861 * Note that any bug fixes made to this routine must be done in the
3862 * version found below.
3863 *
3864 * Called from within the procedure above to deal with unsatisfied
3865 * allocation dependencies in an inodeblock. The buffer must be
3866 * locked, thus, no I/O completion operations can occur while we
3867 * are manipulating its associated dependencies.
3868 */
3869 static void
3870 initiate_write_inodeblock_ufs1(inodedep, bp)
3871 struct inodedep *inodedep;
3872 struct buf *bp; /* The inode block */
3873 {
3874 struct allocdirect *adp, *lastadp;
3875 struct ufs1_dinode *dp;
3876 struct ufs1_dinode *sip;
3877 struct fs *fs;
3878 ufs_lbn_t i;
3879 #ifdef INVARIANTS
3880 ufs_lbn_t prevlbn = 0;
3881 #endif
3882 int deplist;
3883
3884 if (inodedep->id_state & IOSTARTED)
3885 panic("initiate_write_inodeblock_ufs1: already started");
3886 inodedep->id_state |= IOSTARTED;
3887 fs = inodedep->id_fs;
3888 dp = (struct ufs1_dinode *)bp->b_data +
3889 ino_to_fsbo(fs, inodedep->id_ino);
3890 /*
3891 * If the bitmap is not yet written, then the allocated
3892 * inode cannot be written to disk.
3893 */
3894 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3895 if (inodedep->id_savedino1 != NULL)
3896 panic("initiate_write_inodeblock_ufs1: I/O underway");
3897 FREE_LOCK(&lk);
3898 MALLOC(sip, struct ufs1_dinode *,
3899 sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
3900 ACQUIRE_LOCK(&lk);
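/*
 * Save the inode contents and zero the on-disk copy (keeping only
 * the generation number) so that the inode contents do not reach
 * the disk before the bitmap that allocates it; the saved copy is
 * restored in handle_written_inodeblock.
 */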
3901 inodedep->id_savedino1 = sip;
3902 *inodedep->id_savedino1 = *dp;
3903 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3904 dp->di_gen = inodedep->id_savedino1->di_gen;
3905 return;
3906 }
3907 /*
3908 * If no dependencies, then there is nothing to roll back.
3909 */
3910 inodedep->id_savedsize = dp->di_size;
3911 inodedep->id_savedextsize = 0;
3912 if (TAILQ_EMPTY(&inodedep->id_inoupdt))
3913 return;
3914 /*
3915 * Set the dependencies to busy.
3916 */
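/*
 * deplist is a bitmask, indexed by logical block number, of the
 * dependencies busied here; it is consulted only by the INVARIANTS
 * checks below to detect lost dependencies.
 */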
3917 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3918 adp = TAILQ_NEXT(adp, ad_next)) {
3919 #ifdef INVARIANTS
3920 if (deplist != 0 && prevlbn >= adp->ad_lbn)
3921 panic("softdep_write_inodeblock: lbn order");
3922 prevlbn = adp->ad_lbn;
3923 if (adp->ad_lbn < NDADDR &&
3924 dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
3925 panic("%s: direct pointer #%jd mismatch %d != %jd",
3926 "softdep_write_inodeblock",
3927 (intmax_t)adp->ad_lbn,
3928 dp->di_db[adp->ad_lbn],
3929 (intmax_t)adp->ad_newblkno);
3930 if (adp->ad_lbn >= NDADDR &&
3931 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
3932 panic("%s: indirect pointer #%jd mismatch %d != %jd",
3933 "softdep_write_inodeblock",
3934 (intmax_t)adp->ad_lbn - NDADDR,
3935 dp->di_ib[adp->ad_lbn - NDADDR],
3936 (intmax_t)adp->ad_newblkno);
3937 deplist |= 1 << adp->ad_lbn;
3938 if ((adp->ad_state & ATTACHED) == 0)
3939 panic("softdep_write_inodeblock: Unknown state 0x%x",
3940 adp->ad_state);
3941 #endif /* INVARIANTS */
3942 adp->ad_state &= ~ATTACHED;
3943 adp->ad_state |= UNDONE;
3944 }
3945 /*
3946 * The on-disk inode cannot claim to be any larger than the last
3947 * fragment that has been written. Otherwise, the on-disk inode
3948 * might have fragments that were not the last block in the file
3949 * which would corrupt the filesystem.
3950 */
3951 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3952 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3953 if (adp->ad_lbn >= NDADDR)
3954 break;
3955 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3956 /* keep going until hitting a rollback to a frag */
3957 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3958 continue;
3959 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3960 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3961 #ifdef INVARIANTS
3962 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
3963 panic("softdep_write_inodeblock: lost dep1");
3964 #endif /* INVARIANTS */
3965 dp->di_db[i] = 0;
3966 }
3967 for (i = 0; i < NIADDR; i++) {
3968 #ifdef INVARIANTS
3969 if (dp->di_ib[i] != 0 &&
3970 (deplist & ((1 << NDADDR) << i)) == 0)
3971 panic("softdep_write_inodeblock: lost dep2");
3972 #endif /* INVARIANTS */
3973 dp->di_ib[i] = 0;
3974 }
3975 return;
3976 }
3977 /*
3978 * If we have zero'ed out the last allocated block of the file,
3979 * roll back the size to the last currently allocated block.
3980 * We know that this last allocated block is full-sized, as
3981 * we already checked for fragments in the loop above.
3982 */
3983 if (lastadp != NULL &&
3984 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3985 for (i = lastadp->ad_lbn; i >= 0; i--)
3986 if (dp->di_db[i] != 0)
3987 break;
3988 dp->di_size = (i + 1) * fs->fs_bsize;
3989 }
3990 /*
3991 * The only dependencies are for indirect blocks.
3992 *
3993 * The file size for indirect block additions is not guaranteed.
3994 * Such a guarantee would be non-trivial to achieve. The conventional
3995 * synchronous write implementation also does not make this guarantee.
3996 * Fsck should catch and fix discrepancies. Arguably, the file size
3997 * can be over-estimated without destroying integrity when the file
3998 * moves into the indirect blocks (i.e., is large). If we want to
3999 * postpone fsck, we are stuck with this argument.
4000 */
4001 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4002 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4003 }
4004
4005 /*
4006 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
4007 * Note that any bug fixes made to this routine must be done in the
4008 * version found above.
4009 *
4010 * Called from within the procedure above to deal with unsatisfied
4011 * allocation dependencies in an inodeblock. The buffer must be
4012 * locked, thus, no I/O completion operations can occur while we
4013 * are manipulating its associated dependencies.
4014 */
4015 static void
4016 initiate_write_inodeblock_ufs2(inodedep, bp)
4017 struct inodedep *inodedep;
4018 struct buf *bp; /* The inode block */
4019 {
4020 struct allocdirect *adp, *lastadp;
4021 struct ufs2_dinode *dp;
4022 struct ufs2_dinode *sip;
4023 struct fs *fs;
4024 ufs_lbn_t i;
4025 #ifdef INVARIANTS
4026 ufs_lbn_t prevlbn = 0;
4027 #endif
4028 int deplist;
4029
4030 if (inodedep->id_state & IOSTARTED)
4031 panic("initiate_write_inodeblock_ufs2: already started");
4032 inodedep->id_state |= IOSTARTED;
4033 fs = inodedep->id_fs;
4034 dp = (struct ufs2_dinode *)bp->b_data +
4035 ino_to_fsbo(fs, inodedep->id_ino);
4036 /*
4037 * If the bitmap is not yet written, then the allocated
4038 * inode cannot be written to disk.
4039 */
4040 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
4041 if (inodedep->id_savedino2 != NULL)
4042 panic("initiate_write_inodeblock_ufs2: I/O underway");
4043 FREE_LOCK(&lk);
4044 MALLOC(sip, struct ufs2_dinode *,
4045 sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
4046 ACQUIRE_LOCK(&lk);
4047 inodedep->id_savedino2 = sip;
4048 *inodedep->id_savedino2 = *dp;
4049 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
4050 dp->di_gen = inodedep->id_savedino2->di_gen;
4051 return;
4052 }
4053 /*
4054 * If no dependencies, then there is nothing to roll back.
4055 */
4056 inodedep->id_savedsize = dp->di_size;
4057 inodedep->id_savedextsize = dp->di_extsize;
4058 if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
4059 TAILQ_EMPTY(&inodedep->id_extupdt))
4060 return;
4061 /*
4062 * Set the ext data dependencies to busy.
4063 */
4064 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4065 adp = TAILQ_NEXT(adp, ad_next)) {
4066 #ifdef INVARIANTS
4067 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4068 panic("softdep_write_inodeblock: lbn order");
4069 prevlbn = adp->ad_lbn;
4070 if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
4071 panic("%s: direct pointer #%jd mismatch %jd != %jd",
4072 "softdep_write_inodeblock",
4073 (intmax_t)adp->ad_lbn,
4074 (intmax_t)dp->di_extb[adp->ad_lbn],
4075 (intmax_t)adp->ad_newblkno);
4076 deplist |= 1 << adp->ad_lbn;
4077 if ((adp->ad_state & ATTACHED) == 0)
4078 panic("softdep_write_inodeblock: Unknown state 0x%x",
4079 adp->ad_state);
4080 #endif /* INVARIANTS */
4081 adp->ad_state &= ~ATTACHED;
4082 adp->ad_state |= UNDONE;
4083 }
4084 /*
4085 * The on-disk inode cannot claim to be any larger than the last
4086 * fragment that has been written. Otherwise, the on-disk inode
4087 * might have fragments that were not the last block in the ext
4088 * data which would corrupt the filesystem.
4089 */
4090 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
4091 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4092 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
4093 /* keep going until hitting a rollback to a frag */
4094 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4095 continue;
4096 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4097 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
4098 #ifdef INVARIANTS
4099 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
4100 panic("softdep_write_inodeblock: lost dep1");
4101 #endif /* INVARIANTS */
4102 dp->di_extb[i] = 0;
4103 }
4104 lastadp = NULL;
4105 break;
4106 }
4107 /*
4108 * If we have zero'ed out the last allocated block of the ext
4109 * data, roll back the size to the last currently allocated block.
4110 * We know that this last allocated block is full-sized, as
4111 * we already checked for fragments in the loop above.
4112 */
4113 if (lastadp != NULL &&
4114 dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4115 for (i = lastadp->ad_lbn; i >= 0; i--)
4116 if (dp->di_extb[i] != 0)
4117 break;
4118 dp->di_extsize = (i + 1) * fs->fs_bsize;
4119 }
4120 /*
4121 * Set the file data dependencies to busy.
4122 */
4123 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4124 adp = TAILQ_NEXT(adp, ad_next)) {
4125 #ifdef INVARIANTS
4126 if (deplist != 0 && prevlbn >= adp->ad_lbn)
4127 panic("softdep_write_inodeblock: lbn order");
4128 prevlbn = adp->ad_lbn;
4129 if (adp->ad_lbn < NDADDR &&
4130 dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
4131 panic("%s: direct pointer #%jd mismatch %jd != %jd",
4132 "softdep_write_inodeblock",
4133 (intmax_t)adp->ad_lbn,
4134 (intmax_t)dp->di_db[adp->ad_lbn],
4135 (intmax_t)adp->ad_newblkno);
4136 if (adp->ad_lbn >= NDADDR &&
4137 dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
4138 panic("%s indirect pointer #%jd mismatch %jd != %jd",
4139 "softdep_write_inodeblock:",
4140 (intmax_t)adp->ad_lbn - NDADDR,
4141 (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
4142 (intmax_t)adp->ad_newblkno);
4143 deplist |= 1 << adp->ad_lbn;
4144 if ((adp->ad_state & ATTACHED) == 0)
4145 panic("softdep_write_inodeblock: Unknown state 0x%x",
4146 adp->ad_state);
4147 #endif /* INVARIANTS */
4148 adp->ad_state &= ~ATTACHED;
4149 adp->ad_state |= UNDONE;
4150 }
4151 /*
4152 * The on-disk inode cannot claim to be any larger than the last
4153 * fragment that has been written. Otherwise, the on-disk inode
4154 * might have fragments that were not the last block in the file
4155 * which would corrupt the filesystem.
4156 */
4157 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
4158 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
4159 if (adp->ad_lbn >= NDADDR)
4160 break;
4161 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
4162 /* keep going until hitting a rollback to a frag */
4163 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
4164 continue;
4165 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
4166 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
4167 #ifdef INVARIANTS
4168 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
4169 panic("softdep_write_inodeblock: lost dep2");
4170 #endif /* INVARIANTS */
4171 dp->di_db[i] = 0;
4172 }
4173 for (i = 0; i < NIADDR; i++) {
4174 #ifdef INVARIANTS
4175 if (dp->di_ib[i] != 0 &&
4176 (deplist & ((1 << NDADDR) << i)) == 0)
4177 panic("softdep_write_inodeblock: lost dep3");
4178 #endif /* INVARIANTS */
4179 dp->di_ib[i] = 0;
4180 }
4181 return;
4182 }
4183 /*
4184 * If we have zero'ed out the last allocated block of the file,
4185 * roll back the size to the last currently allocated block.
4186 * We know that this last allocated block is full-sized, as
4187 * we already checked for fragments in the loop above.
4188 */
4189 if (lastadp != NULL &&
4190 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
4191 for (i = lastadp->ad_lbn; i >= 0; i--)
4192 if (dp->di_db[i] != 0)
4193 break;
4194 dp->di_size = (i + 1) * fs->fs_bsize;
4195 }
4196 /*
4197 * The only dependencies are for indirect blocks.
4198 *
4199 * The file size for indirect block additions is not guaranteed.
4200 * Such a guarantee would be non-trivial to achieve. The conventional
4201 * synchronous write implementation also does not make this guarantee.
4202 * Fsck should catch and fix discrepancies. Arguably, the file size
4203 * can be over-estimated without destroying integrity when the file
4204 * moves into the indirect blocks (i.e., is large). If we want to
4205 * postpone fsck, we are stuck with this argument.
4206 */
4207 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
4208 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
4209 }
4210
4211 /*
4212 * This routine is called during the completion interrupt
4213 * service routine for a disk write (from the procedure called
4214 * by the device driver to inform the filesystem caches of
4215 * a request completion). It should be called early in this
4216 * procedure, before the block is made available to other
4217 * processes or other routines are called.
4218 */
4219 static void
4220 softdep_disk_write_complete(bp)
4221 struct buf *bp; /* describes the completed disk write */
4222 {
4223 struct worklist *wk;
4224 struct worklist *owk;
4225 struct workhead reattach;
4226 struct newblk *newblk;
4227 struct allocindir *aip;
4228 struct allocdirect *adp;
4229 struct indirdep *indirdep;
4230 struct inodedep *inodedep;
4231 struct bmsafemap *bmsafemap;
4232
4233 /*
4234 * If an error occurred while doing the write, then the data
4235 * has not hit the disk and the dependencies cannot be unrolled.
4236 */
4237 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
4238 return;
4239 LIST_INIT(&reattach);
4240 /*
4241 * This lock must not be released anywhere in this code segment.
4242 */
4243 ACQUIRE_LOCK(&lk);
4244 owk = NULL;
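/*
 * owk tracks the most recently processed item; encountering the
 * same item twice in a row indicates a corrupted work list.
 */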
4245 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
4246 WORKLIST_REMOVE(wk);
4247 if (wk == owk)
4248 panic("duplicate worklist: %p\n", wk);
4249 owk = wk;
4250 switch (wk->wk_type) {
4251
4252 case D_PAGEDEP:
4253 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
4254 WORKLIST_INSERT(&reattach, wk);
4255 continue;
4256
4257 case D_INODEDEP:
4258 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
4259 WORKLIST_INSERT(&reattach, wk);
4260 continue;
4261
4262 case D_BMSAFEMAP:
4263 bmsafemap = WK_BMSAFEMAP(wk);
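/*
 * The cylinder group bitmap has been written, so every inode
 * and block allocation that was waiting on it is now
 * DEPCOMPLETE and can be moved along.
 */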
4264 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
4265 newblk->nb_state |= DEPCOMPLETE;
4266 newblk->nb_bmsafemap = NULL;
4267 LIST_REMOVE(newblk, nb_deps);
4268 }
4269 while ((adp =
4270 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
4271 adp->ad_state |= DEPCOMPLETE;
4272 adp->ad_buf = NULL;
4273 LIST_REMOVE(adp, ad_deps);
4274 handle_allocdirect_partdone(adp);
4275 }
4276 while ((aip =
4277 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
4278 aip->ai_state |= DEPCOMPLETE;
4279 aip->ai_buf = NULL;
4280 LIST_REMOVE(aip, ai_deps);
4281 handle_allocindir_partdone(aip);
4282 }
4283 while ((inodedep =
4284 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
4285 inodedep->id_state |= DEPCOMPLETE;
4286 LIST_REMOVE(inodedep, id_deps);
4287 inodedep->id_buf = NULL;
4288 }
4289 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
4290 continue;
4291
4292 case D_MKDIR:
4293 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
4294 continue;
4295
4296 case D_ALLOCDIRECT:
4297 adp = WK_ALLOCDIRECT(wk);
4298 adp->ad_state |= COMPLETE;
4299 handle_allocdirect_partdone(adp);
4300 continue;
4301
4302 case D_ALLOCINDIR:
4303 aip = WK_ALLOCINDIR(wk);
4304 aip->ai_state |= COMPLETE;
4305 handle_allocindir_partdone(aip);
4306 continue;
4307
4308 case D_INDIRDEP:
4309 indirdep = WK_INDIRDEP(wk);
4310 if (indirdep->ir_state & GOINGAWAY)
4311 panic("disk_write_complete: indirdep gone");
4312 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
4313 FREE(indirdep->ir_saveddata, M_INDIRDEP);
4314 indirdep->ir_saveddata = 0;
4315 indirdep->ir_state &= ~UNDONE;
4316 indirdep->ir_state |= ATTACHED;
4317 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
4318 handle_allocindir_partdone(aip);
4319 if (aip == LIST_FIRST(&indirdep->ir_donehd))
4320 panic("disk_write_complete: not gone");
4321 }
4322 WORKLIST_INSERT(&reattach, wk);
4323 if ((bp->b_flags & B_DELWRI) == 0)
4324 stat_indir_blk_ptrs++;
4325 bdirty(bp);
4326 continue;
4327
4328 default:
4329 panic("handle_disk_write_complete: Unknown type %s",
4330 TYPENAME(wk->wk_type));
4331 /* NOTREACHED */
4332 }
4333 }
4334 /*
4335 * Reattach any requests that must be redone.
4336 */
4337 while ((wk = LIST_FIRST(&reattach)) != NULL) {
4338 WORKLIST_REMOVE(wk);
4339 WORKLIST_INSERT(&bp->b_dep, wk);
4340 }
4341 FREE_LOCK(&lk);
4342 }
4343
4344 /*
4345 * Called from within softdep_disk_write_complete above. Note that
4346 * this routine is always called from interrupt level with further
4347 * splbio interrupts blocked.
4348 */
4349 static void
4350 handle_allocdirect_partdone(adp)
4351 struct allocdirect *adp; /* the completed allocdirect */
4352 {
4353 struct allocdirectlst *listhead;
4354 struct allocdirect *listadp;
4355 struct inodedep *inodedep;
4356 long bsize, delay;
4357
4358 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4359 return;
4360 if (adp->ad_buf != NULL)
4361 panic("handle_allocdirect_partdone: dangling dep");
4362 /*
4363 * The on-disk inode cannot claim to be any larger than the last
4364 * fragment that has been written. Otherwise, the on-disk inode
4365 * might have fragments that were not the last block in the file
4366 * which would corrupt the filesystem. Thus, we cannot free any
4367 * allocdirects after one whose ad_oldblkno claims a fragment as
4368 * these blocks must be rolled back to zero before writing the inode.
4369 * We check the currently active set of allocdirects in id_inoupdt
4370 * or id_extupdt as appropriate.
4371 */
4372 inodedep = adp->ad_inodedep;
4373 bsize = inodedep->id_fs->fs_bsize;
4374 if (adp->ad_state & EXTDATA)
4375 listhead = &inodedep->id_extupdt;
4376 else
4377 listhead = &inodedep->id_inoupdt;
4378 TAILQ_FOREACH(listadp, listhead, ad_next) {
4379 /* found our block */
4380 if (listadp == adp)
4381 break;
4382 /* continue if the old block is not a fragment */
4383 if (listadp->ad_oldsize == 0 ||
4384 listadp->ad_oldsize == bsize)
4385 continue;
4386 /* hit a fragment */
4387 return;
4388 }
4389 /*
4390 * If we have reached the end of the current list without
4391 * finding the just finished dependency, then it must be
4392 * on the future dependency list. Future dependencies cannot
4393 * be freed until they are moved to the current list.
4394 */
4395 if (listadp == NULL) {
4396 #ifdef DEBUG
4397 if (adp->ad_state & EXTDATA)
4398 listhead = &inodedep->id_newextupdt;
4399 else
4400 listhead = &inodedep->id_newinoupdt;
4401 TAILQ_FOREACH(listadp, listhead, ad_next)
4402 /* found our block */
4403 if (listadp == adp)
4404 break;
4405 if (listadp == NULL)
4406 panic("handle_allocdirect_partdone: lost dep");
4407 #endif /* DEBUG */
4408 return;
4409 }
4410 /*
4411 * If we have found the just finished dependency, then free
4412 * it along with anything that follows it that is complete.
4413 * If the inode still has a bitmap dependency, then it has
4414 * never been written to disk, hence the on-disk inode cannot
4415 * reference the old fragment so we can free it without delay.
4416 */
4417 delay = (inodedep->id_state & DEPCOMPLETE);
4418 for (; adp; adp = listadp) {
4419 listadp = TAILQ_NEXT(adp, ad_next);
4420 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
4421 return;
4422 free_allocdirect(listhead, adp, delay);
4423 }
4424 }
4425
4426 /*
4427 * Called from within softdep_disk_write_complete above. Note that
4428 * this routine is always called from interrupt level with further
4429 * splbio interrupts blocked.
4430 */
4431 static void
4432 handle_allocindir_partdone(aip)
4433 struct allocindir *aip; /* the completed allocindir */
4434 {
4435 struct indirdep *indirdep;
4436
4437 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4438 return;
4439 if (aip->ai_buf != NULL)
4440 panic("handle_allocindir_partdone: dangling dependency");
4441 indirdep = aip->ai_indirdep;
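/*
 * If the indirect block is currently rolled back for a write in
 * progress (UNDONE), defer the pointer update by parking the
 * allocindir on ir_donehd; it is finished when the write completes.
 */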
4442 if (indirdep->ir_state & UNDONE) {
4443 LIST_REMOVE(aip, ai_next);
4444 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4445 return;
4446 }
4447 if (indirdep->ir_state & UFS1FMT)
4448 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4449 aip->ai_newblkno;
4450 else
4451 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4452 aip->ai_newblkno;
4453 LIST_REMOVE(aip, ai_next);
4454 if (aip->ai_freefrag != NULL)
4455 add_to_worklist(&aip->ai_freefrag->ff_list);
4456 WORKITEM_FREE(aip, D_ALLOCINDIR);
4457 }
4458
4459 /*
4460 * Called from within softdep_disk_write_complete above to restore
4461 * in-memory inode block contents to their most up-to-date state. Note
4462 * that this routine is always called from interrupt level with further
4463 * splbio interrupts blocked.
4464 */
4465 static int
4466 handle_written_inodeblock(inodedep, bp)
4467 struct inodedep *inodedep;
4468 struct buf *bp; /* buffer containing the inode block */
4469 {
4470 struct worklist *wk, *filefree;
4471 struct allocdirect *adp, *nextadp;
4472 struct ufs1_dinode *dp1 = NULL;
4473 struct ufs2_dinode *dp2 = NULL;
4474 int hadchanges, fstype;
4475
4476 if ((inodedep->id_state & IOSTARTED) == 0)
4477 panic("handle_written_inodeblock: not started");
4478 inodedep->id_state &= ~IOSTARTED;
4479 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4480 fstype = UFS1;
4481 dp1 = (struct ufs1_dinode *)bp->b_data +
4482 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4483 } else {
4484 fstype = UFS2;
4485 dp2 = (struct ufs2_dinode *)bp->b_data +
4486 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4487 }
4488 /*
4489 * If we had to rollback the inode allocation because of
4490 * bitmaps being incomplete, then simply restore it.
4491 * Keep the block dirty so that it will not be reclaimed until
4492 * all associated dependencies have been cleared and the
4493 * corresponding updates written to disk.
4494 */
4495 if (inodedep->id_savedino1 != NULL) {
4496 if (fstype == UFS1)
4497 *dp1 = *inodedep->id_savedino1;
4498 else
4499 *dp2 = *inodedep->id_savedino2;
4500 FREE(inodedep->id_savedino1, M_SAVEDINO);
4501 inodedep->id_savedino1 = NULL;
4502 if ((bp->b_flags & B_DELWRI) == 0)
4503 stat_inode_bitmap++;
4504 bdirty(bp);
4505 return (1);
4506 }
4507 inodedep->id_state |= COMPLETE;
4508 /*
4509 * Roll forward anything that had to be rolled back before
4510 * the inode could be updated.
4511 */
4512 hadchanges = 0;
4513 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4514 nextadp = TAILQ_NEXT(adp, ad_next);
4515 if (adp->ad_state & ATTACHED)
4516 panic("handle_written_inodeblock: new entry");
4517 if (fstype == UFS1) {
4518 if (adp->ad_lbn < NDADDR) {
4519 if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4520 panic("%s %s #%jd mismatch %d != %jd",
4521 "handle_written_inodeblock:",
4522 "direct pointer",
4523 (intmax_t)adp->ad_lbn,
4524 dp1->di_db[adp->ad_lbn],
4525 (intmax_t)adp->ad_oldblkno);
4526 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4527 } else {
4528 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4529 panic("%s: %s #%jd allocated as %d",
4530 "handle_written_inodeblock",
4531 "indirect pointer",
4532 (intmax_t)adp->ad_lbn - NDADDR,
4533 dp1->di_ib[adp->ad_lbn - NDADDR]);
4534 dp1->di_ib[adp->ad_lbn - NDADDR] =
4535 adp->ad_newblkno;
4536 }
4537 } else {
4538 if (adp->ad_lbn < NDADDR) {
4539 if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
4540 panic("%s: %s #%jd %s %jd != %jd",
4541 "handle_written_inodeblock",
4542 "direct pointer",
4543 (intmax_t)adp->ad_lbn, "mismatch",
4544 (intmax_t)dp2->di_db[adp->ad_lbn],
4545 (intmax_t)adp->ad_oldblkno);
4546 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4547 } else {
4548 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4549 panic("%s: %s #%jd allocated as %jd",
4550 "handle_written_inodeblock",
4551 "indirect pointer",
4552 (intmax_t)adp->ad_lbn - NDADDR,
4553 (intmax_t)
4554 dp2->di_ib[adp->ad_lbn - NDADDR]);
4555 dp2->di_ib[adp->ad_lbn - NDADDR] =
4556 adp->ad_newblkno;
4557 }
4558 }
4559 adp->ad_state &= ~UNDONE;
4560 adp->ad_state |= ATTACHED;
4561 hadchanges = 1;
4562 }
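/*
 * Roll forward the extended attribute block pointers. These exist
 * only on UFS2 inodes, so dp2 is valid whenever this list is
 * non-empty.
 */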
4563 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
4564 nextadp = TAILQ_NEXT(adp, ad_next);
4565 if (adp->ad_state & ATTACHED)
4566 panic("handle_written_inodeblock: new entry");
4567 if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
4568 panic("%s: direct pointers #%jd %s %jd != %jd",
4569 "handle_written_inodeblock",
4570 (intmax_t)adp->ad_lbn, "mismatch",
4571 (intmax_t)dp2->di_extb[adp->ad_lbn],
4572 (intmax_t)adp->ad_oldblkno);
4573 dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
4574 adp->ad_state &= ~UNDONE;
4575 adp->ad_state |= ATTACHED;
4576 hadchanges = 1;
4577 }
4578 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4579 stat_direct_blk_ptrs++;
4580 /*
4581 * Reset the file size to its most up-to-date value.
4582 */
4583 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
4584 panic("handle_written_inodeblock: bad size");
4585 if (fstype == UFS1) {
4586 if (dp1->di_size != inodedep->id_savedsize) {
4587 dp1->di_size = inodedep->id_savedsize;
4588 hadchanges = 1;
4589 }
4590 } else {
4591 if (dp2->di_size != inodedep->id_savedsize) {
4592 dp2->di_size = inodedep->id_savedsize;
4593 hadchanges = 1;
4594 }
4595 if (dp2->di_extsize != inodedep->id_savedextsize) {
4596 dp2->di_extsize = inodedep->id_savedextsize;
4597 hadchanges = 1;
4598 }
4599 }
4600 inodedep->id_savedsize = -1;
4601 inodedep->id_savedextsize = -1;
4602 /*
4603 * If there were any rollbacks in the inode block, then it must be
4604 * marked dirty so that it will eventually get written back in
4605 * its correct form.
4606 */
4607 if (hadchanges)
4608 bdirty(bp);
4609 /*
4610 * Process any allocdirects that completed during the update.
4611 */
4612 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4613 handle_allocdirect_partdone(adp);
4614 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
4615 handle_allocdirect_partdone(adp);
4616 /*
4617 * Process deallocations that were held pending until the
4618 * inode had been written to disk. Freeing of the inode
4619 * is delayed until after all blocks have been freed to
4620 * avoid creation of new <vfsid, inum, lbn> triples
4621 * before the old ones have been deleted.
4622 */
4623 filefree = NULL;
4624 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4625 WORKLIST_REMOVE(wk);
4626 switch (wk->wk_type) {
4627
4628 case D_FREEFILE:
4629 /*
4630 * We defer adding filefree to the worklist until
4631 * all other additions have been made to ensure
4632 * that it will be done after all the old blocks
4633 * have been freed.
4634 */
4635 if (filefree != NULL)
4636 panic("handle_written_inodeblock: filefree");
4637 filefree = wk;
4638 continue;
4639
4640 case D_MKDIR:
4641 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4642 continue;
4643
4644 case D_DIRADD:
4645 diradd_inode_written(WK_DIRADD(wk), inodedep);
4646 continue;
4647
4648 case D_FREEBLKS:
4649 wk->wk_state |= COMPLETE;
4650 if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4651 continue;
4652 /* -- fall through -- */
4653 case D_FREEFRAG:
4654 case D_DIRREM:
4655 add_to_worklist(wk);
4656 continue;
4657
4658 case D_NEWDIRBLK:
4659 free_newdirblk(WK_NEWDIRBLK(wk));
4660 continue;
4661
4662 default:
4663 panic("handle_written_inodeblock: Unknown type %s",
4664 TYPENAME(wk->wk_type));
4665 /* NOTREACHED */
4666 }
4667 }
4668 if (filefree != NULL) {
4669 if (free_inodedep(inodedep) == 0)
4670 panic("handle_written_inodeblock: live inodedep");
4671 add_to_worklist(filefree);
4672 return (0);
4673 }
4674
4675 /*
4676 * If no outstanding dependencies, free it.
4677 */
4678 if (free_inodedep(inodedep) ||
4679 (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
4680 TAILQ_FIRST(&inodedep->id_extupdt) == 0))
4681 return (0);
4682 return (hadchanges);
4683 }
4684
4685 /*
4686 * Process a diradd entry after its dependent inode has been written.
4687 * This routine must be called with splbio interrupts blocked.
4688 */
4689 static void
4690 diradd_inode_written(dap, inodedep)
4691 struct diradd *dap;
4692 struct inodedep *inodedep;
4693 {
4694 struct pagedep *pagedep;
4695
4696 dap->da_state |= COMPLETE;
4697 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4698 if (dap->da_state & DIRCHG)
4699 pagedep = dap->da_previous->dm_pagedep;
4700 else
4701 pagedep = dap->da_pagedep;
4702 LIST_REMOVE(dap, da_pdlist);
4703 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4704 }
4705 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4706 }
4707
4708 /*
4709 * Handle the completion of a mkdir dependency.
4710 */
4711 static void
4712 handle_written_mkdir(mkdir, type)
4713 struct mkdir *mkdir;
4714 int type;
4715 {
4716 struct diradd *dap;
4717 struct pagedep *pagedep;
4718
4719 if (mkdir->md_state != type)
4720 panic("handle_written_mkdir: bad type");
4721 dap = mkdir->md_diradd;
4722 dap->da_state &= ~type;
4723 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4724 dap->da_state |= DEPCOMPLETE;
4725 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4726 if (dap->da_state & DIRCHG)
4727 pagedep = dap->da_previous->dm_pagedep;
4728 else
4729 pagedep = dap->da_pagedep;
4730 LIST_REMOVE(dap, da_pdlist);
4731 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4732 }
4733 LIST_REMOVE(mkdir, md_mkdirs);
4734 WORKITEM_FREE(mkdir, D_MKDIR);
4735 }
4736
4737 /*
4738 * Called from within softdep_disk_write_complete above.
4739 * A write operation was just completed. Removed inodes can
4740 * now be freed and associated block pointers may be committed.
4741 * Note that this routine is always called from interrupt level
4742 * with further splbio interrupts blocked.
4743 */
4744 static int
4745 handle_written_filepage(pagedep, bp)
4746 struct pagedep *pagedep;
4747 struct buf *bp; /* buffer containing the written page */
4748 {
4749 struct dirrem *dirrem;
4750 struct diradd *dap, *nextdap;
4751 struct direct *ep;
4752 int i, chgs;
4753
4754 if ((pagedep->pd_state & IOSTARTED) == 0)
4755 panic("handle_written_filepage: not started");
4756 pagedep->pd_state &= ~IOSTARTED;
4757 /*
4758 * Process any directory removals that have been committed.
4759 */
4760 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4761 LIST_REMOVE(dirrem, dm_next);
4762 dirrem->dm_dirinum = pagedep->pd_ino;
4763 add_to_worklist(&dirrem->dm_list);
4764 }
4765 /*
4766 * Free any directory additions that have been committed.
4767 * If it is a newly allocated block, we have to wait until
4768 * the on-disk directory inode claims the new block.
4769 */
4770 if ((pagedep->pd_state & NEWBLOCK) == 0)
4771 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4772 free_diradd(dap);
4773 /*
4774 * Uncommitted directory entries must be restored.
4775 */
4776 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4777 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4778 dap = nextdap) {
4779 nextdap = LIST_NEXT(dap, da_pdlist);
4780 if (dap->da_state & ATTACHED)
4781 panic("handle_written_filepage: attached");
4782 ep = (struct direct *)
4783 ((char *)bp->b_data + dap->da_offset);
4784 ep->d_ino = dap->da_newinum;
4785 dap->da_state &= ~UNDONE;
4786 dap->da_state |= ATTACHED;
4787 chgs = 1;
4788 /*
4789 * If the inode referenced by the directory has
4790 * been written out, then the dependency can be
4791 * moved to the pending list.
4792 */
4793 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4794 LIST_REMOVE(dap, da_pdlist);
4795 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4796 da_pdlist);
4797 }
4798 }
4799 }
4800 /*
4801 * If there were any rollbacks in the directory, then it must be
4802 * marked dirty so that it will eventually get written back in
4803 * its correct form.
4804 */
4805 if (chgs) {
4806 if ((bp->b_flags & B_DELWRI) == 0)
4807 stat_dir_entry++;
4808 bdirty(bp);
4809 return (1);
4810 }
4811 /*
4812 * If we are not waiting for a new directory block to be
4813 * claimed by its inode, then the pagedep will be freed.
4814 * Otherwise it will remain to track any new entries on
4815 * the page in case they are fsync'ed.
4816 */
4817 if ((pagedep->pd_state & NEWBLOCK) == 0) {
4818 LIST_REMOVE(pagedep, pd_hash);
4819 WORKITEM_FREE(pagedep, D_PAGEDEP);
4820 }
4821 return (0);
4822 }
4823
4824 /*
4825 * Writing back in-core inode structures.
4826 *
4827 * The filesystem only accesses an inode's contents when it occupies an
4828 * "in-core" inode structure. These "in-core" structures are separate from
4829 * the page frames used to cache inode blocks. Only the latter are
4830 * transferred to/from the disk. So, when the updated contents of the
4831 * "in-core" inode structure are copied to the corresponding in-memory inode
4832 * block, the dependencies are also transferred. The following procedure is
4833 * called when copying a dirty "in-core" inode to a cached inode block.
4834 */
4835
4836 /*
4837 * Called when an inode is loaded from disk. If the effective link count
4838 * differed from the actual link count when it was last flushed, then we
4839 * need to ensure that the correct effective link count is put back.
4840 */
4841 void
4842 softdep_load_inodeblock(ip)
4843 struct inode *ip; /* the "in_core" copy of the inode */
4844 {
4845 struct inodedep *inodedep;
4846
4847 /*
4848 * Check for alternate nlink count.
4849 */
4850 ip->i_effnlink = ip->i_nlink;
4851 ACQUIRE_LOCK(&lk);
4852 if (inodedep_lookup(UFSTOVFS(ip->i_ump),
4853 ip->i_number, 0, &inodedep) == 0) {
4854 FREE_LOCK(&lk);
4855 return;
4856 }
4857 ip->i_effnlink -= inodedep->id_nlinkdelta;
4858 if (inodedep->id_state & SPACECOUNTED)
4859 ip->i_flag |= IN_SPACECOUNTED;
4860 FREE_LOCK(&lk);
4861 }
4862
4863 /*
4864 * This routine is called just before the "in-core" inode
4865 * information is to be copied to the in-memory inode block.
4866 * Recall that an inode block contains several inodes. If
4867 * the force flag is set, then the dependencies will be
4868 * cleared so that the update can always be made. Note that
4869 * the buffer is locked when this routine is called, so we
4870 * will never be in the middle of writing the inode block
4871 * to disk.
4872 */
4873 void
4874 softdep_update_inodeblock(ip, bp, waitfor)
4875 struct inode *ip; /* the "in_core" copy of the inode */
4876 struct buf *bp; /* the buffer containing the inode block */
4877 int waitfor; /* nonzero => update must be allowed */
4878 {
4879 struct inodedep *inodedep;
4880 struct worklist *wk;
4881 struct mount *mp;
4882 struct buf *ibp;
4883 int error;
4884
4885 /*
4886 * If the effective link count is not equal to the actual link
4887 * count, then we must track the difference in an inodedep while
4888 * the inode is (potentially) tossed out of the cache. Otherwise,
4889 * if there is no existing inodedep, then there are no dependencies
4890 * to track.
4891 */
4892 mp = UFSTOVFS(ip->i_ump);
4893 ACQUIRE_LOCK(&lk);
4894 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
4895 FREE_LOCK(&lk);
4896 if (ip->i_effnlink != ip->i_nlink)
4897 panic("softdep_update_inodeblock: bad link count");
4898 return;
4899 }
4900 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
4901 panic("softdep_update_inodeblock: bad delta");
4902 /*
4903 * Changes have been initiated. Anything depending on these
4904 * changes cannot occur until this inode has been written.
4905 */
4906 inodedep->id_state &= ~COMPLETE;
4907 if ((inodedep->id_state & ONWORKLIST) == 0)
4908 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4909 /*
4910 * Any new dependencies associated with the incore inode must
4911 * now be moved to the list associated with the buffer holding
4912 * the in-memory copy of the inode. Once merged, process any
4913 * allocdirects that are completed by the merger.
4914 */
4915 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
4916 if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
4917 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4918 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
4919 if (!TAILQ_EMPTY(&inodedep->id_extupdt))
4920 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
4921 /*
4922 * Now that the inode has been pushed into the buffer, the
4923 * operations dependent on the inode being written to disk
4924 * can be moved to the id_bufwait so that they will be
4925 * processed when the buffer I/O completes.
4926 */
4927 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4928 WORKLIST_REMOVE(wk);
4929 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4930 }
4931 /*
4932 * Newly allocated inodes cannot be written until the bitmap
4933 * that allocates them has been written (indicated by
4934 * DEPCOMPLETE being set in id_state). If we are doing a
4935 * forced sync (e.g., an fsync on a file), we force the bitmap
4936 * to be written so that the update can be done.
4937 */
4938 if (waitfor == 0) {
4939 FREE_LOCK(&lk);
4940 return;
4941 }
4942 retry:
4943 if ((inodedep->id_state & DEPCOMPLETE) != 0) {
4944 FREE_LOCK(&lk);
4945 return;
4946 }
4947 ibp = inodedep->id_buf;
4948 ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
4949 if (ibp == NULL) {
4950 /*
4951 * If ibp came back as NULL, the dependency could have been
4952 * freed while we slept. Look it up again, and check to see
4953 * that it has completed.
4954 */
4955 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
4956 goto retry;
4957 FREE_LOCK(&lk);
4958 return;
4959 }
4960 FREE_LOCK(&lk);
4961 if ((error = bwrite(ibp)) != 0)
4962 softdep_error("softdep_update_inodeblock: bwrite", error);
4963 }
4964
4965 /*
4966 * Merge a new inode dependency list (such as id_newinoupdt) into an
4967 * old inode dependency list (such as id_inoupdt). This routine must be
4968 * called with splbio interrupts blocked.
4969 */
4970 static void
4971 merge_inode_lists(newlisthead, oldlisthead)
4972 struct allocdirectlst *newlisthead;
4973 struct allocdirectlst *oldlisthead;
4974 {
4975 struct allocdirect *listadp, *newadp;
4976
4977 newadp = TAILQ_FIRST(newlisthead);
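/*
 * Both lists are ordered by logical block number. Splice each new
 * entry into the old list in order, merging entries that cover the
 * same block via allocdirect_merge; any leftovers are appended.
 */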
4978 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
4979 if (listadp->ad_lbn < newadp->ad_lbn) {
4980 listadp = TAILQ_NEXT(listadp, ad_next);
4981 continue;
4982 }
4983 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4984 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4985 if (listadp->ad_lbn == newadp->ad_lbn) {
4986 allocdirect_merge(oldlisthead, newadp,
4987 listadp);
4988 listadp = newadp;
4989 }
4990 newadp = TAILQ_FIRST(newlisthead);
4991 }
4992 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
4993 TAILQ_REMOVE(newlisthead, newadp, ad_next);
4994 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
4995 }
4996 }
4997
4998 /*
4999 * If we are doing an fsync, then we must ensure that any directory
5000 * entries for the inode have been written after the inode gets to disk.
5001 */
5002 int
5003 softdep_fsync(vp)
5004 struct vnode *vp; /* the "in_core" copy of the inode */
5005 {
5006 struct inodedep *inodedep;
5007 struct pagedep *pagedep;
5008 struct worklist *wk;
5009 struct diradd *dap;
5010 struct mount *mp;
5011 struct vnode *pvp;
5012 struct inode *ip;
5013 struct buf *bp;
5014 struct fs *fs;
5015 struct thread *td = curthread;
5016 int error, flushparent, pagedep_new_block;
5017 ino_t parentino;
5018 ufs_lbn_t lbn;
5019
5020 ip = VTOI(vp);
5021 fs = ip->i_fs;
5022 mp = vp->v_mount;
5023 ACQUIRE_LOCK(&lk);
5024 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
5025 FREE_LOCK(&lk);
5026 return (0);
5027 }
5028 if (!LIST_EMPTY(&inodedep->id_inowait) ||
5029 !LIST_EMPTY(&inodedep->id_bufwait) ||
5030 !TAILQ_EMPTY(&inodedep->id_extupdt) ||
5031 !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
5032 !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
5033 !TAILQ_EMPTY(&inodedep->id_newinoupdt))
5034 panic("softdep_fsync: pending ops");
5035 for (error = 0, flushparent = 0; ; ) {
5036 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
5037 break;
5038 if (wk->wk_type != D_DIRADD)
5039 panic("softdep_fsync: Unexpected type %s",
5040 TYPENAME(wk->wk_type));
5041 dap = WK_DIRADD(wk);
5042 /*
5043 * Flush our parent if this directory entry has a MKDIR_PARENT
5044 * dependency or is contained in a newly allocated block.
5045 */
5046 if (dap->da_state & DIRCHG)
5047 pagedep = dap->da_previous->dm_pagedep;
5048 else
5049 pagedep = dap->da_pagedep;
5050 parentino = pagedep->pd_ino;
5051 lbn = pagedep->pd_lbn;
5052 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
5053 panic("softdep_fsync: dirty");
5054 if ((dap->da_state & MKDIR_PARENT) ||
5055 (pagedep->pd_state & NEWBLOCK))
5056 flushparent = 1;
5057 else
5058 flushparent = 0;
5059 /*
5060 * If we are being fsync'ed as part of vgone'ing this vnode,
5061 * then we will not be able to release and recover the
5062 * vnode below, so we just have to give up on writing its
5063 * directory entry out. It will eventually be written, just
5064 * not now, but then the user was not asking to have it
5065 * written, so we are not breaking any promises.
5066 */
5067 if (vp->v_iflag & VI_DOOMED)
5068 break;
5069 /*
5070 * We prevent deadlock by always fetching inodes from the
5071 * root, moving down the directory tree. Thus, when fetching
5072 * our parent directory, we first try to get the lock. If
5073 * that fails, we must unlock ourselves before requesting
5074 * the lock on our parent. See the comment in ufs_lookup
5075 * for details on possible races.
5076 */
5077 FREE_LOCK(&lk);
5078 if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
5079 VOP_UNLOCK(vp, 0, td);
5080 error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
5081 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
5082 if (error != 0)
5083 return (error);
5084 }
5085 /*
5086 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
5087 * that are contained in direct blocks will be resolved by
5088 * doing an ffs_update. Pagedeps contained in indirect blocks
5089 * may require a complete sync'ing of the directory. So, we
5090 * try the cheap and fast ffs_update first, and if that fails,
5091 * then we do the slower ffs_syncvnode of the directory.
5092 */
5093 if (flushparent) {
5094 int locked;
5095
5096 if ((error = ffs_update(pvp, 1)) != 0) {
5097 vput(pvp);
5098 return (error);
5099 }
5100 ACQUIRE_LOCK(&lk);
5101 locked = 1;
5102 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
5103 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
5104 if (wk->wk_type != D_DIRADD)
5105 panic("softdep_fsync: Unexpected type %s",
5106 TYPENAME(wk->wk_type));
5107 dap = WK_DIRADD(wk);
5108 if (dap->da_state & DIRCHG)
5109 pagedep = dap->da_previous->dm_pagedep;
5110 else
5111 pagedep = dap->da_pagedep;
5112 pagedep_new_block = pagedep->pd_state & NEWBLOCK;
5113 FREE_LOCK(&lk);
5114 locked = 0;
5115 if (pagedep_new_block &&
5116 (error = ffs_syncvnode(pvp, MNT_WAIT))) {
5117 vput(pvp);
5118 return (error);
5119 }
5120 }
5121 }
5122 if (locked)
5123 FREE_LOCK(&lk);
5124 }
5125 /*
5126 * Flush directory page containing the inode's name.
5127 */
5128 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
5129 &bp);
5130 if (error == 0)
5131 error = bwrite(bp);
5132 else
5133 brelse(bp);
5134 vput(pvp);
5135 if (error != 0)
5136 return (error);
5137 ACQUIRE_LOCK(&lk);
5138 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
5139 break;
5140 }
5141 FREE_LOCK(&lk);
5142 return (0);
5143 }
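
/*
* A minimal sketch of the expected calling convention, not a definitive
* statement about the callers living elsewhere in the tree: a routine such
* as ffs_fsync() is assumed to push the inode and its data first, and only
* then ask soft updates to force out the directory entries naming it:
*
*	error = ffs_syncvnode(vp, MNT_WAIT);
*	if (error == 0 && DOINGSOFTDEP(vp))
*		error = softdep_fsync(vp);
*/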
5144
5145 /*
5146 * Flush all the dirty bitmaps associated with the block device
5147 * before flushing the rest of the dirty blocks so as to reduce
5148 * the number of dependencies that will have to be rolled back.
5149 */
5150 void
5151 softdep_fsync_mountdev(vp)
5152 struct vnode *vp;
5153 {
5154 struct buf *bp, *nbp;
5155 struct worklist *wk;
5156
5157 if (!vn_isdisk(vp, NULL))
5158 panic("softdep_fsync_mountdev: vnode not a disk");
5159 restart:
5160 ACQUIRE_LOCK(&lk);
5161 VI_LOCK(vp);
5162 TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
5163 /*
5164 * If it is already scheduled, skip to the next buffer.
5165 */
5166 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
5167 continue;
5168
5169 if ((bp->b_flags & B_DELWRI) == 0)
5170 panic("softdep_fsync_mountdev: not dirty");
5171 /*
5172 * We are only interested in bitmaps with outstanding
5173 * dependencies.
5174 */
5175 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
5176 wk->wk_type != D_BMSAFEMAP ||
5177 (bp->b_vflags & BV_BKGRDINPROG)) {
5178 BUF_UNLOCK(bp);
5179 continue;
5180 }
5181 VI_UNLOCK(vp);
5182 FREE_LOCK(&lk);
5183 bremfree(bp);
5184 (void) bawrite(bp);
5185 goto restart;
5186 }
5187 FREE_LOCK(&lk);
5188 drain_output(vp);
5189 VI_UNLOCK(vp);
5190 }
5191
5192 /*
5193 * This routine is called when we are trying to synchronously flush a
5194 * file. This routine must eliminate any filesystem metadata dependencies
5195 * so that the syncing routine can succeed by pushing the dirty blocks
5196 * associated with the file. If any I/O errors occur, they are returned.
5197 */
5198 int
5199 softdep_sync_metadata(struct vnode *vp)
5200 {
5201 struct pagedep *pagedep;
5202 struct allocdirect *adp;
5203 struct allocindir *aip;
5204 struct buf *bp, *nbp;
5205 struct worklist *wk;
5206 int i, error, waitfor;
5207
5208 if (!DOINGSOFTDEP(vp))
5209 return (0);
5210 /*
5211 * Ensure that any direct block dependencies have been cleared.
5212 */
5213 ACQUIRE_LOCK(&lk);
5214 if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
5215 FREE_LOCK(&lk);
5216 return (error);
5217 }
5218 FREE_LOCK(&lk);
5219 /*
5220 * For most files, the only metadata dependencies are the
5221 * cylinder group maps that allocate their inode or blocks.
5222 * The block allocation dependencies can be found by traversing
5223 * the dependency lists for any buffers that remain on their
5224 * dirty buffer list. The inode allocation dependency will
5225 * be resolved when the inode is updated with MNT_WAIT.
5226 * This work is done in two passes. The first pass grabs most
5227 * of the buffers and begins asynchronously writing them. The
5228 * only way to wait for these asynchronous writes is to sleep
5229 * on the filesystem vnode which may stay busy for a long time
5230 * if the filesystem is active. So, instead, we make a second
5231 * pass over the dependencies blocking on each write. In the
5232 * usual case we will be blocking against a write that we
5233 * initiated, so when it is done the dependency will have been
5234 * resolved. Thus the second pass is expected to end quickly.
5235 */
5236 waitfor = MNT_NOWAIT;
5237
5238 top:
5239 /*
5240 * We must wait for any I/O in progress to finish so that
5241 * all potential buffers on the dirty list will be visible.
5242 */
5243 VI_LOCK(vp);
5244 drain_output(vp);
5245 while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
5246 bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
5247 if (bp)
5248 break;
5249 }
5250 VI_UNLOCK(vp);
5251 if (bp == NULL)
5252 return (0);
5253 loop:
5254 /* While syncing snapshots, we must allow recursive lookups */
5255 bp->b_lock.lk_flags |= LK_CANRECURSE;
5256 ACQUIRE_LOCK(&lk);
5257 /*
5258 * As we hold the buffer locked, none of its dependencies
5259 * will disappear.
5260 */
5261 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5262 switch (wk->wk_type) {
5263
5264 case D_ALLOCDIRECT:
5265 adp = WK_ALLOCDIRECT(wk);
5266 if (adp->ad_state & DEPCOMPLETE)
5267 continue;
5268 nbp = adp->ad_buf;
5269 nbp = getdirtybuf(nbp, &lk, waitfor);
5270 if (nbp == NULL)
5271 continue;
5272 FREE_LOCK(&lk);
5273 if (waitfor == MNT_NOWAIT) {
5274 bawrite(nbp);
5275 } else if ((error = bwrite(nbp)) != 0) {
5276 break;
5277 }
5278 ACQUIRE_LOCK(&lk);
5279 continue;
5280
5281 case D_ALLOCINDIR:
5282 aip = WK_ALLOCINDIR(wk);
5283 if (aip->ai_state & DEPCOMPLETE)
5284 continue;
5285 nbp = aip->ai_buf;
5286 nbp = getdirtybuf(nbp, &lk, waitfor);
5287 if (nbp == NULL)
5288 continue;
5289 FREE_LOCK(&lk);
5290 if (waitfor == MNT_NOWAIT) {
5291 bawrite(nbp);
5292 } else if ((error = bwrite(nbp)) != 0) {
5293 break;
5294 }
5295 ACQUIRE_LOCK(&lk);
5296 continue;
5297
5298 case D_INDIRDEP:
5299 restart:
5300
5301 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
5302 if (aip->ai_state & DEPCOMPLETE)
5303 continue;
5304 nbp = aip->ai_buf;
5305 nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
5306 if (nbp == NULL)
5307 goto restart;
5308 FREE_LOCK(&lk);
5309 if ((error = bwrite(nbp)) != 0) {
5310 goto loop_end;
5311 }
5312 ACQUIRE_LOCK(&lk);
5313 goto restart;
5314 }
5315 continue;
5316
5317 case D_INODEDEP:
5318 if ((error = flush_inodedep_deps(wk->wk_mp,
5319 WK_INODEDEP(wk)->id_ino)) != 0) {
5320 FREE_LOCK(&lk);
5321 break;
5322 }
5323 continue;
5324
5325 case D_PAGEDEP:
5326 /*
5327 * We are trying to sync a directory that may
5328 * have dependencies on both its own metadata
5329 * and/or dependencies on the inodes of any
5330 * recently allocated files. We walk its diradd
5331 * lists pushing out the associated inode.
5332 */
5333 pagedep = WK_PAGEDEP(wk);
5334 for (i = 0; i < DAHASHSZ; i++) {
5335 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
5336 continue;
5337 if ((error =
5338 flush_pagedep_deps(vp, wk->wk_mp,
5339 &pagedep->pd_diraddhd[i]))) {
5340 FREE_LOCK(&lk);
5341 goto loop_end;
5342 }
5343 }
5344 continue;
5345
5346 case D_MKDIR:
5347 /*
5348 * This case should never happen if the vnode has
5349 * been properly sync'ed. However, if this function
5350 * is used at a place where the vnode has not yet
5351 * been sync'ed, this dependency can show up. So,
5352 * rather than panic, just flush it.
5353 */
5354 nbp = WK_MKDIR(wk)->md_buf;
5355 nbp = getdirtybuf(nbp, &lk, waitfor);
5356 if (nbp == NULL)
5357 continue;
5358 FREE_LOCK(&lk);
5359 if (waitfor == MNT_NOWAIT) {
5360 bawrite(nbp);
5361 } else if ((error = bwrite(nbp)) != 0) {
5362 break;
5363 }
5364 ACQUIRE_LOCK(&lk);
5365 continue;
5366
5367 case D_BMSAFEMAP:
5368 /*
5369 * This case should never happen if the vnode has
5370 * been properly sync'ed. However, if this function
5371 * is used at a place where the vnode has not yet
5372 * been sync'ed, this dependency can show up. So,
5373 * rather than panic, just flush it.
5374 */
5375 nbp = WK_BMSAFEMAP(wk)->sm_buf;
5376 nbp = getdirtybuf(nbp, &lk, waitfor);
5377 if (nbp == NULL)
5378 continue;
5379 FREE_LOCK(&lk);
5380 if (waitfor == MNT_NOWAIT) {
5381 bawrite(nbp);
5382 } else if ((error = bwrite(nbp)) != 0) {
5383 break;
5384 }
5385 ACQUIRE_LOCK(&lk);
5386 continue;
5387
5388 default:
5389 panic("softdep_sync_metadata: Unknown type %s",
5390 TYPENAME(wk->wk_type));
5391 /* NOTREACHED */
5392 }
5393 loop_end:
5394 /* We reach here only on error, with the work-list lock already dropped */
5395 if (error == 0)
5396 panic("softdep_sync_metadata: zero error");
5397 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5398 bawrite(bp);
5399 return (error);
5400 }
5401 FREE_LOCK(&lk);
5402 VI_LOCK(vp);
5403 while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
5404 nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
5405 if (nbp)
5406 break;
5407 }
5408 VI_UNLOCK(vp);
5409 bp->b_lock.lk_flags &= ~LK_CANRECURSE;
5410 bawrite(bp);
5411 if (nbp != NULL) {
5412 bp = nbp;
5413 goto loop;
5414 }
5415 /*
5416 * The brief unlock is to allow any pent up dependency
5417 * processing to be done. Then proceed with the second pass.
5418 */
5419 if (waitfor == MNT_NOWAIT) {
5420 waitfor = MNT_WAIT;
5421 goto top;
5422 }
5423
5424 /*
5425 * If we have managed to get rid of all the dirty buffers,
5426 * then we are done. For certain directories and block
5427 * devices, we may need to do further work.
5428 *
5429 * We must wait for any I/O in progress to finish so that
5430 * all potential buffers on the dirty list will be visible.
5431 */
5432 VI_LOCK(vp);
5433 drain_output(vp);
5434 VI_UNLOCK(vp);
5435 return (0);
5436 }
5437
5438 /*
5439 * Flush the dependencies associated with an inodedep.
5440 * Called with splbio blocked.
5441 */
5442 static int
5443 flush_inodedep_deps(mp, ino)
5444 struct mount *mp;
5445 ino_t ino;
5446 {
5447 struct inodedep *inodedep;
5448 int error, waitfor;
5449
5450 /*
5451 * This work is done in two passes. The first pass grabs most
5452 * of the buffers and begins asynchronously writing them. The
5453 * only way to wait for these asynchronous writes is to sleep
5454 * on the filesystem vnode which may stay busy for a long time
5455 * if the filesystem is active. So, instead, we make a second
5456 * pass over the dependencies blocking on each write. In the
5457 * usual case we will be blocking against a write that we
5458 * initiated, so when it is done the dependency will have been
5459 * resolved. Thus the second pass is expected to end quickly.
5460 * We give a brief window at the top of the loop to allow
5461 * any pending I/O to complete.
5462 */
5463 for (error = 0, waitfor = MNT_NOWAIT; ; ) {
5464 if (error)
5465 return (error);
5466 FREE_LOCK(&lk);
5467 ACQUIRE_LOCK(&lk);
5468 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5469 return (0);
5470 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
5471 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
5472 flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
5473 flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
5474 continue;
5475 /*
5476 * If this was the second pass, we are done; otherwise start the second pass.
5477 */
5478 if (waitfor == MNT_WAIT)
5479 break;
5480 waitfor = MNT_WAIT;
5481 }
5482 /*
5483 * Try freeing inodedep in case all dependencies have been removed.
5484 */
5485 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
5486 (void) free_inodedep(inodedep);
5487 return (0);
5488 }
5489
5490 /*
5491 * Flush an inode dependency list. Called with splbio blocked.
5492 * Returns 1 (with any write error in *errorp) if the work-list lock was
* dropped and the caller must rescan its lists; returns 0 once the entire
* list has been scanned without finding anything left to flush.
5493 */
5494 static int
5495 flush_deplist(listhead, waitfor, errorp)
5496 struct allocdirectlst *listhead;
5497 int waitfor;
5498 int *errorp;
5499 {
5500 struct allocdirect *adp;
5501 struct buf *bp;
5502
5503 mtx_assert(&lk, MA_OWNED);
5504 TAILQ_FOREACH(adp, listhead, ad_next) {
5505 if (adp->ad_state & DEPCOMPLETE)
5506 continue;
5507 bp = adp->ad_buf;
5508 bp = getdirtybuf(bp, &lk, waitfor);
5509 if (bp == NULL) {
5510 if (waitfor == MNT_NOWAIT)
5511 continue;
5512 return (1);
5513 }
5514 FREE_LOCK(&lk);
5515 if (waitfor == MNT_NOWAIT) {
5516 bawrite(bp);
5517 } else if ((*errorp = bwrite(bp)) != 0) {
5518 ACQUIRE_LOCK(&lk);
5519 return (1);
5520 }
5521 ACQUIRE_LOCK(&lk);
5522 return (1);
5523 }
5524 return (0);
5525 }
5526
5527 /*
5528 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5529 * Called with splbio blocked.
5530 */
5531 static int
5532 flush_pagedep_deps(pvp, mp, diraddhdp)
5533 struct vnode *pvp;
5534 struct mount *mp;
5535 struct diraddhd *diraddhdp;
5536 {
5537 struct inodedep *inodedep;
5538 struct ufsmount *ump;
5539 struct diradd *dap;
5540 struct vnode *vp;
5541 int error = 0;
5542 struct buf *bp;
5543 ino_t inum;
5544 struct worklist *wk;
5545
5546 ump = VFSTOUFS(mp);
5547 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5548 /*
5549 * Flush ourselves if this directory entry
5550 * has a MKDIR_PARENT dependency.
5551 */
5552 if (dap->da_state & MKDIR_PARENT) {
5553 FREE_LOCK(&lk);
5554 if ((error = ffs_update(pvp, 1)) != 0)
5555 break;
5556 ACQUIRE_LOCK(&lk);
5557 /*
5558 * If that cleared dependencies, go on to next.
5559 */
5560 if (dap != LIST_FIRST(diraddhdp))
5561 continue;
5562 if (dap->da_state & MKDIR_PARENT)
5563 panic("flush_pagedep_deps: MKDIR_PARENT");
5564 }
5565 /*
5566 * A newly allocated directory must have its "." and
5567 * ".." entries written out before its name can be
5568 * committed in its parent. We do not want or need
5569 * the full semantics of a synchronous ffs_syncvnode as
5570 * that may end up here again, once for each directory
5571 * level in the filesystem. Instead, we push the blocks
5572 * and wait for them to clear. We have to fsync twice
5573 * because the first call may choose to defer blocks
5574 * that still have dependencies, but deferral will
5575 * happen at most once.
5576 */
5577 inum = dap->da_newinum;
5578 if (dap->da_state & MKDIR_BODY) {
5579 FREE_LOCK(&lk);
5580 if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
5581 break;
5582 if ((error = ffs_syncvnode(vp, MNT_NOWAIT)) ||
5583 (error = ffs_syncvnode(vp, MNT_NOWAIT))) {
5584 vput(vp);
5585 break;
5586 }
5587 VI_LOCK(vp);
5588 drain_output(vp);
5589 /*
5590 * If first block is still dirty with a D_MKDIR
5591 * dependency then it needs to be written now.
5592 */
5593 for (;;) {
5594 error = 0;
5595 bp = gbincore(&vp->v_bufobj, 0);
5596 if (bp == NULL)
5597 break; /* First block not present */
5598 error = BUF_LOCK(bp,
5599 LK_EXCLUSIVE |
5600 LK_SLEEPFAIL |
5601 LK_INTERLOCK,
5602 VI_MTX(vp));
5603 VI_LOCK(vp);
5604 if (error == ENOLCK)
5605 continue; /* Slept, retry */
5606 if (error != 0)
5607 break; /* Failed */
5608 if ((bp->b_flags & B_DELWRI) == 0) {
5609 BUF_UNLOCK(bp);
5610 break; /* Buffer not dirty */
5611 }
5612 for (wk = LIST_FIRST(&bp->b_dep);
5613 wk != NULL;
5614 wk = LIST_NEXT(wk, wk_list))
5615 if (wk->wk_type == D_MKDIR)
5616 break;
5617 if (wk == NULL)
5618 BUF_UNLOCK(bp); /* Dependency gone */
5619 else {
5620 /*
5621 * D_MKDIR dependency remains,
5622 * must write buffer to stable
5623 * storage.
5624 */
5625 VI_UNLOCK(vp);
5626 bremfree(bp);
5627 error = bwrite(bp);
5628 VI_LOCK(vp);
5629 }
5630 break;
5631 }
5632 VI_UNLOCK(vp);
5633 vput(vp);
5634 if (error != 0)
5635 break; /* Flushing of first block failed */
5636 ACQUIRE_LOCK(&lk);
5637 /*
5638 * If that cleared dependencies, go on to next.
5639 */
5640 if (dap != LIST_FIRST(diraddhdp))
5641 continue;
5642 if (dap->da_state & MKDIR_BODY)
5643 panic("flush_pagedep_deps: MKDIR_BODY");
5644 }
5645 /*
5646 * Flush the inode on which the directory entry depends.
5647 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5648 * the only remaining dependency is that the updated inode
5649 * count must get pushed to disk. The inode has already
5650 * been pushed into its inode buffer (via VOP_UPDATE) at
5651 * the time of the reference count change. So we need only
5652 * locate that buffer, ensure that there will be no rollback
5653 * caused by a bitmap dependency, then write the inode buffer.
5654 */
5655 retry:
5656 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
5657 panic("flush_pagedep_deps: lost inode");
5658 /*
5659 * If the inode still has bitmap dependencies,
5660 * push them to disk.
5661 */
5662 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5663 bp = inodedep->id_buf;
5664 bp = getdirtybuf(bp, &lk, MNT_WAIT);
5665 if (bp == NULL)
5666 goto retry;
5667 FREE_LOCK(&lk);
5668 if ((error = bwrite(bp)) != 0)
5669 break;
5670 ACQUIRE_LOCK(&lk);
5671 if (dap != LIST_FIRST(diraddhdp))
5672 continue;
5673 }
5674 /*
5675 * If the inode is still sitting in a buffer waiting
5676 * to be written, push it to disk.
5677 */
5678 FREE_LOCK(&lk);
5679 if ((error = bread(ump->um_devvp,
5680 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5681 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5682 brelse(bp);
5683 break;
5684 }
5685 if ((error = bwrite(bp)) != 0)
5686 break;
5687 ACQUIRE_LOCK(&lk);
5688 /*
5689 * If we have failed to get rid of all the dependencies
5690 * then something is seriously wrong.
5691 */
5692 if (dap == LIST_FIRST(diraddhdp))
5693 panic("flush_pagedep_deps: flush failed");
5694 }
5695 if (error)
5696 ACQUIRE_LOCK(&lk);
5697 return (error);
5698 }
5699
5700 /*
5701 * A large burst of file addition or deletion activity can drive the
5702 * memory load excessively high. First attempt to slow things down
5703 * using the techniques below. If that fails, this routine requests
5704 * the offending operations to fall back to running synchronously
5705 * until the memory load returns to a reasonable level.
5706 */
5707 int
5708 softdep_slowdown(vp)
5709 struct vnode *vp;
5710 {
5711 int max_softdeps_hard;
5712
5713 ACQUIRE_LOCK(&lk);
5714 max_softdeps_hard = max_softdeps * 11 / 10;
5715 if (num_dirrem < max_softdeps_hard / 2 &&
5716 num_inodedep < max_softdeps_hard &&
5717 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps &&
5718 num_freeblkdep < max_softdeps_hard) {
5719 FREE_LOCK(&lk);
5720 return (0);
5721 }
5722 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
5723 softdep_speedup();
5724 stat_sync_limit_hit += 1;
5725 FREE_LOCK(&lk);
5726 return (1);
5727 }
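
/*
* A hedged sketch of how a caller is assumed to react to the return value
* above (the real callers live in the UFS remove/truncate paths, not in
* this file): a nonzero return asks the caller to fall back to synchronous
* behavior until the dependency counts drop, for example (hypothetical):
*
*	if (softdep_slowdown(vp))
*		(void) ffs_syncvnode(vp, MNT_WAIT);
*/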
5728
5729 /*
5730 * Called by the allocation routines when they are about to fail
5731 * in the hope that we can free up some disk space.
5732 *
5733 * First check to see if the work list has anything on it. If it has,
5734 * clean up entries until we successfully free some space. Because this
5735 * process holds inodes locked, we cannot handle any remove requests
5736 * that might block on a locked inode as that could lead to deadlock.
5737 * If the worklist yields no free space, encourage the syncer daemon
5738 * to help us. In no event will we try for longer than tickdelay seconds.
5739 */
5740 int
5741 softdep_request_cleanup(fs, vp)
5742 struct fs *fs;
5743 struct vnode *vp;
5744 {
5745 struct ufsmount *ump;
5746 long starttime;
5747 ufs2_daddr_t needed;
5748 int error;
5749
5750 ump = VTOI(vp)->i_ump;
5751 mtx_assert(UFS_MTX(ump), MA_OWNED);
5752 needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
5753 starttime = time_second + tickdelay;
5754 /*
5755 * If we are being called because of a process doing a
5756 * copy-on-write, then it is not safe to update the vnode
5757 * as we may recurse into the copy-on-write routine.
5758 */
5759 if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
5760 UFS_UNLOCK(ump);
5761 error = ffs_update(vp, 1);
5762 UFS_LOCK(ump);
5763 if (error != 0)
5764 return (0);
5765 }
5766 while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
5767 if (time_second > starttime)
5768 return (0);
5769 UFS_UNLOCK(ump);
5770 ACQUIRE_LOCK(&lk);
5771 if (ump->softdep_on_worklist > 0 &&
5772 process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
5773 stat_worklist_push += 1;
5774 FREE_LOCK(&lk);
5775 UFS_LOCK(ump);
5776 continue;
5777 }
5778 request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
5779 FREE_LOCK(&lk);
5780 UFS_LOCK(ump);
5781 }
5782 return (1);
5783 }
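
/*
* A hedged sketch of how softdep_request_cleanup() is expected to be used
* by the block allocator (the real caller is in the ffs allocation code,
* not shown here): on an out-of-space condition the allocation is retried
* once if some space could be reclaimed, roughly (retry is a hypothetical
* label in the caller):
*
*	UFS_LOCK(ump);
*	...
*	if (softdep_request_cleanup(fs, vp) != 0)
*		goto retry;
*/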
5784
5785 /*
5786 * If memory utilization has gotten too high, deliberately slow things
5787 * down and speed up the I/O processing.
5788 */
5789 extern struct thread *syncertd;
5790 static int
5791 request_cleanup(mp, resource)
5792 struct mount *mp;
5793 int resource;
5794 {
5795 struct thread *td = curthread;
5796 struct ufsmount *ump;
5797
5798 mtx_assert(&lk, MA_OWNED);
5799 /*
5800 * We never hold up the filesystem syncer or buf daemon.
5801 */
5802 if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
5803 return (0);
5804 ump = VFSTOUFS(mp);
5805 /*
5806 * First check to see if the work list has gotten backlogged.
5807 * If it has, co-opt this process to help clean up two entries.
5808 * Because this process may hold inodes locked, we cannot
5809 * handle any remove requests that might block on a locked
5810 * inode as that could lead to deadlock. We set TDP_SOFTDEP
5811 * to avoid recursively processing the worklist.
5812 */
5813 if (ump->softdep_on_worklist > max_softdeps / 10) {
5814 td->td_pflags |= TDP_SOFTDEP;
5815 process_worklist_item(mp, LK_NOWAIT);
5816 process_worklist_item(mp, LK_NOWAIT);
5817 td->td_pflags &= ~TDP_SOFTDEP;
5818 stat_worklist_push += 2;
5819 return (1);
5820 }
5821 /*
5822 * Next, we attempt to speed up the syncer process. If that
5823 * is successful, then we allow the process to continue.
5824 */
5825 if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
5826 return (0);
5827 /*
5828 * If we are resource constrained on inode dependencies, try
5829 * flushing some dirty inodes. Otherwise, we are constrained
5830 * by file deletions, so try accelerating flushes of directories
5831 * with removal dependencies. We would like to do the cleanup
5832 * here, but we probably hold an inode locked at this point and
5833 * that might deadlock against one that we try to clean. So,
5834 * the best that we can do is request the syncer daemon to do
5835 * the cleanup for us.
5836 */
5837 switch (resource) {
5838
5839 case FLUSH_INODES:
5840 stat_ino_limit_push += 1;
5841 req_clear_inodedeps += 1;
5842 stat_countp = &stat_ino_limit_hit;
5843 break;
5844
5845 case FLUSH_REMOVE:
5846 case FLUSH_REMOVE_WAIT:
5847 stat_blk_limit_push += 1;
5848 req_clear_remove += 1;
5849 stat_countp = &stat_blk_limit_hit;
5850 break;
5851
5852 default:
5853 panic("request_cleanup: unknown type");
5854 }
5855 /*
5856 * Hopefully the syncer daemon will catch up and awaken us.
5857 * We wait at most tickdelay before proceeding in any case.
5858 */
5859 proc_waiting += 1;
5860 if (handle.callout == NULL)
5861 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5862 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
5863 proc_waiting -= 1;
5864 return (1);
5865 }
5866
5867 /*
5868 * Awaken processes pausing in request_cleanup and clear proc_waiting
5869 * to indicate that there is no longer a timer running.
5870 */
5871 static void
5872 pause_timer(arg)
5873 void *arg;
5874 {
5875
5876 ACQUIRE_LOCK(&lk);
5877 *stat_countp += 1;
5878 wakeup_one(&proc_waiting);
5879 if (proc_waiting > 0)
5880 handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
5881 else
5882 handle.callout = NULL;
5883 FREE_LOCK(&lk);
5884 }
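
/*
* Note on the timer protocol above: request_cleanup() arms the callout the
* first time a process blocks (handle.callout == NULL), and pause_timer()
* re-arms it whenever processes are still recorded in proc_waiting and
* clears handle.callout otherwise, so at most one callout is outstanding
* no matter how many processes are waiting.
*/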
5885
5886 /*
5887 * Flush out a directory with at least one removal dependency in an effort to
5888 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5889 */
5890 static void
5891 clear_remove(td)
5892 struct thread *td;
5893 {
5894 struct pagedep_hashhead *pagedephd;
5895 struct pagedep *pagedep;
5896 static int next = 0;
5897 struct mount *mp;
5898 struct vnode *vp;
5899 int error, cnt;
5900 ino_t ino;
5901
5902 mtx_assert(&lk, MA_OWNED);
5903
5904 for (cnt = 0; cnt < pagedep_hash; cnt++) {
5905 pagedephd = &pagedep_hashtbl[next++];
5906 if (next >= pagedep_hash)
5907 next = 0;
5908 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5909 if (LIST_EMPTY(&pagedep->pd_dirremhd))
5910 continue;
5911 mp = pagedep->pd_list.wk_mp;
5912 ino = pagedep->pd_ino;
5913 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5914 continue;
5915 FREE_LOCK(&lk);
5916 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
5917 softdep_error("clear_remove: vget", error);
5918 vn_finished_write(mp);
5919 ACQUIRE_LOCK(&lk);
5920 return;
5921 }
5922 if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5923 softdep_error("clear_remove: fsync", error);
5924 VI_LOCK(vp);
5925 drain_output(vp);
5926 VI_UNLOCK(vp);
5927 vput(vp);
5928 vn_finished_write(mp);
5929 ACQUIRE_LOCK(&lk);
5930 return;
5931 }
5932 }
5933 }
5934
5935 /*
5936 * Clear out a block of dirty inodes in an effort to reduce
5937 * the number of inodedep dependency structures.
5938 */
5939 static void
5940 clear_inodedeps(td)
5941 struct thread *td;
5942 {
5943 struct inodedep_hashhead *inodedephd;
5944 struct inodedep *inodedep;
5945 static int next = 0;
5946 struct mount *mp;
5947 struct vnode *vp;
5948 struct fs *fs;
5949 int error, cnt;
5950 ino_t firstino, lastino, ino;
5951
5952 mtx_assert(&lk, MA_OWNED);
5953 /*
5954 * Pick an inode dependency to be cleared, cycling round-robin through
5955 * the hash chains. We will then gather up all the inodes in its block
5956 * that have dependencies and flush them out.
5957 */
5958 for (cnt = 0; cnt < inodedep_hash; cnt++) {
5959 inodedephd = &inodedep_hashtbl[next++];
5960 if (next >= inodedep_hash)
5961 next = 0;
5962 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5963 break;
5964 }
5965 if (inodedep == NULL)
5966 return;
5967 fs = inodedep->id_fs;
5968 mp = inodedep->id_list.wk_mp;
5969 /*
5970 * Find the last inode in the block with dependencies.
5971 */
5972 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5973 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5974 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
5975 break;
5976 /*
5977 * Asynchronously push all but the last inode with dependencies.
5978 * Synchronously push the last inode with dependencies to ensure
5979 * that the inode block gets written to free up the inodedeps.
5980 */
5981 for (ino = firstino; ino <= lastino; ino++) {
5982 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
5983 continue;
5984 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5985 continue;
5986 FREE_LOCK(&lk);
5987 if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
5988 softdep_error("clear_inodedeps: vget", error);
5989 vn_finished_write(mp);
5990 ACQUIRE_LOCK(&lk);
5991 return;
5992 }
5993 if (ino == lastino) {
5994 if ((error = ffs_syncvnode(vp, MNT_WAIT)))
5995 softdep_error("clear_inodedeps: fsync1", error);
5996 } else {
5997 if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
5998 softdep_error("clear_inodedeps: fsync2", error);
5999 VI_LOCK(vp);
6000 drain_output(vp);
6001 VI_UNLOCK(vp);
6002 }
6003 vput(vp);
6004 vn_finished_write(mp);
6005 ACQUIRE_LOCK(&lk);
6006 }
6007 }
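
/*
* A hedged sketch of how the two helpers above are expected to be driven:
* request_cleanup() bumps req_clear_remove or req_clear_inodedeps, and the
* worklist thread (softdep_process_worklist(), earlier in this file) is
* assumed to notice the counters on a later pass and act on them, roughly:
*
*	if (req_clear_inodedeps) {
*		clear_inodedeps(curthread);
*		req_clear_inodedeps -= 1;
*		wakeup_one(&proc_waiting);
*	}
*/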
6008
6009 /*
6010 * Function to determine if the buffer has outstanding dependencies
6011 * that will cause a roll-back if the buffer is written. If wantcount
6012 * is set, return number of dependencies, otherwise just yes or no.
6013 */
6014 static int
6015 softdep_count_dependencies(bp, wantcount)
6016 struct buf *bp;
6017 int wantcount;
6018 {
6019 struct worklist *wk;
6020 struct inodedep *inodedep;
6021 struct indirdep *indirdep;
6022 struct allocindir *aip;
6023 struct pagedep *pagedep;
6024 struct diradd *dap;
6025 int i, retval;
6026
6027 retval = 0;
6028 ACQUIRE_LOCK(&lk);
6029 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6030 switch (wk->wk_type) {
6031
6032 case D_INODEDEP:
6033 inodedep = WK_INODEDEP(wk);
6034 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
6035 /* bitmap allocation dependency */
6036 retval += 1;
6037 if (!wantcount)
6038 goto out;
6039 }
6040 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
6041 /* direct block pointer dependency */
6042 retval += 1;
6043 if (!wantcount)
6044 goto out;
6045 }
6046 if (TAILQ_FIRST(&inodedep->id_extupdt)) {
6047 /* direct block pointer dependency */
6048 retval += 1;
6049 if (!wantcount)
6050 goto out;
6051 }
6052 continue;
6053
6054 case D_INDIRDEP:
6055 indirdep = WK_INDIRDEP(wk);
6056
6057 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
6058 /* indirect block pointer dependency */
6059 retval += 1;
6060 if (!wantcount)
6061 goto out;
6062 }
6063 continue;
6064
6065 case D_PAGEDEP:
6066 pagedep = WK_PAGEDEP(wk);
6067 for (i = 0; i < DAHASHSZ; i++) {
6068
6069 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
6070 /* directory entry dependency */
6071 retval += 1;
6072 if (!wantcount)
6073 goto out;
6074 }
6075 }
6076 continue;
6077
6078 case D_BMSAFEMAP:
6079 case D_ALLOCDIRECT:
6080 case D_ALLOCINDIR:
6081 case D_MKDIR:
6082 /* never a dependency on these blocks */
6083 continue;
6084
6085 default:
6086 panic("softdep_count_dependencies: Unexpected type %s",
6087 TYPENAME(wk->wk_type));
6088 /* NOTREACHED */
6089 }
6090 }
6091 out:
6092 FREE_LOCK(&lk);
6093 return (retval);
6094 }
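
/*
* A hedged sketch of how this count is expected to be consumed: the routine
* is assumed to be reached through the buf_countdeps() bioops hook, letting
* the buffer-flushing code defer buffers whose write would immediately roll
* back and pick another candidate instead, for example:
*
*	if (buf_countdeps(bp, 0) != 0)
*		continue;
*/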
6095
6096 /*
6097 * Acquire exclusive access to a buffer.
6098 * Must be called with a locked mtx parameter.
6099 * Return acquired buffer or NULL on failure.
6100 */
6101 static struct buf *
6102 getdirtybuf(bp, mtx, waitfor)
6103 struct buf *bp;
6104 struct mtx *mtx;
6105 int waitfor;
6106 {
6107 int error;
6108
6109 mtx_assert(mtx, MA_OWNED);
6110 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
6111 if (waitfor != MNT_WAIT)
6112 return (NULL);
6113 error = BUF_LOCK(bp,
6114 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
6115 /*
6116 * Even if we successfully acquire bp here, we have dropped
6117 * mtx, which may violate our guarantee.
6118 */
6119 if (error == 0)
6120 BUF_UNLOCK(bp);
6121 else if (error != ENOLCK)
6122 panic("getdirtybuf: inconsistent lock: %d", error);
6123 mtx_lock(mtx);
6124 return (NULL);
6125 }
6126 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6127 if (mtx == &lk && waitfor == MNT_WAIT) {
6128 mtx_unlock(mtx);
6129 BO_LOCK(bp->b_bufobj);
6130 BUF_UNLOCK(bp);
6131 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
6132 bp->b_vflags |= BV_BKGRDWAIT;
6133 msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
6134 PRIBIO | PDROP, "getbuf", 0);
6135 } else
6136 BO_UNLOCK(bp->b_bufobj);
6137 mtx_lock(mtx);
6138 return (NULL);
6139 }
6140 BUF_UNLOCK(bp);
6141 if (waitfor != MNT_WAIT)
6142 return (NULL);
6143 /*
6144 * The mtx argument must be bp->b_vp's mutex in
6145 * this case.
6146 */
6147 #ifdef DEBUG_VFS_LOCKS
6148 if (bp->b_vp->v_type != VCHR)
6149 ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
6150 #endif
6151 bp->b_vflags |= BV_BKGRDWAIT;
6152 msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
6153 return (NULL);
6154 }
6155 if ((bp->b_flags & B_DELWRI) == 0) {
6156 BUF_UNLOCK(bp);
6157 return (NULL);
6158 }
6159 bremfree(bp);
6160 return (bp);
6161 }
6162
6163
6164 /*
6165 * Check if it is safe to suspend the file system now. On entry,
6166 * the vnode interlock for devvp should be held. Return 0 with
6167 * the mount interlock held if the file system can be suspended now,
6168 * otherwise return EAGAIN with the mount interlock held.
6169 */
6170 int
6171 softdep_check_suspend(struct mount *mp,
6172 struct vnode *devvp,
6173 int softdep_deps,
6174 int softdep_accdeps,
6175 int secondary_writes,
6176 int secondary_accwrites)
6177 {
6178 struct bufobj *bo;
6179 struct ufsmount *ump;
6180 int error;
6181
6182 ASSERT_VI_LOCKED(devvp, "softdep_check_suspend");
6183 ump = VFSTOUFS(mp);
6184 bo = &devvp->v_bufobj;
6185
6186 for (;;) {
6187 if (!TRY_ACQUIRE_LOCK(&lk)) {
6188 VI_UNLOCK(devvp);
6189 ACQUIRE_LOCK(&lk);
6190 FREE_LOCK(&lk);
6191 VI_LOCK(devvp);
6192 continue;
6193 }
6194 if (!MNT_ITRYLOCK(mp)) {
6195 FREE_LOCK(&lk);
6196 VI_UNLOCK(devvp);
6197 MNT_ILOCK(mp);
6198 MNT_IUNLOCK(mp);
6199 VI_LOCK(devvp);
6200 continue;
6201 }
6202 if (mp->mnt_secondary_writes != 0) {
6203 FREE_LOCK(&lk);
6204 VI_UNLOCK(devvp);
6205 msleep(&mp->mnt_secondary_writes,
6206 MNT_MTX(mp),
6207 (PUSER - 1) | PDROP, "secwr", 0);
6208 VI_LOCK(devvp);
6209 continue;
6210 }
6211 break;
6212 }
6213
6214 /*
6215 * Reasons for needing more work before suspend:
6216 * - Dirty buffers on devvp.
6217 * - Softdep activity occurred after start of vnode sync loop
6218 * - Secondary writes occurred after start of vnode sync loop
6219 */
6220 error = 0;
6221 if (bo->bo_numoutput > 0 ||
6222 bo->bo_dirty.bv_cnt > 0 ||
6223 softdep_deps != 0 ||
6224 ump->softdep_deps != 0 ||
6225 softdep_accdeps != ump->softdep_accdeps ||
6226 secondary_writes != 0 ||
6227 mp->mnt_secondary_writes != 0 ||
6228 secondary_accwrites != mp->mnt_secondary_accwrites)
6229 error = EAGAIN;
6230 FREE_LOCK(&lk);
6231 VI_UNLOCK(devvp);
6232 return (error);
6233 }
6234
6235
6236 /*
6237 * Get the number of dependency structures for the file system, both
6238 * the current number and the total number allocated. These will
6239 * later be used to detect that softdep processing has occurred.
6240 */
6241 void
6242 softdep_get_depcounts(struct mount *mp,
6243 int *softdep_depsp,
6244 int *softdep_accdepsp)
6245 {
6246 struct ufsmount *ump;
6247
6248 ump = VFSTOUFS(mp);
6249 ACQUIRE_LOCK(&lk);
6250 *softdep_depsp = ump->softdep_deps;
6251 *softdep_accdepsp = ump->softdep_accdeps;
6252 FREE_LOCK(&lk);
6253 }
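
/*
* A hedged sketch of the intended handshake between the two routines above
* (the driving loop is assumed to live in the ffs suspension code): sample
* the counts, sync every vnode on the mount, then ask whether anything moved
* in the meantime, repeating while EAGAIN is returned:
*
*	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
*	... sync all vnodes on mp, counting secondary writes ...
*	VI_LOCK(devvp);
*	error = softdep_check_suspend(mp, devvp, softdep_deps,
*	    softdep_accdeps, secondary_writes, secondary_accwrites);
*/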
6254
6255 /*
6256 * Wait for pending output on a vnode to complete.
6257 * Must be called with vnode lock and interlock locked.
6258 *
6259 * XXX: Should just be a call to bufobj_wwait().
6260 */
6261 static void
6262 drain_output(vp)
6263 struct vnode *vp;
6264 {
6265 ASSERT_VOP_LOCKED(vp, "drain_output");
6266 ASSERT_VI_LOCKED(vp, "drain_output");
6267
6268 while (vp->v_bufobj.bo_numoutput) {
6269 vp->v_bufobj.bo_flag |= BO_WWAIT;
6270 msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
6271 VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
6272 }
6273 }
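
/*
* A minimal sketch of the bufobj_wwait() form suggested by the XXX note
* above, assuming bufobj_wwait(bo, slpflag, timeo) keeps its present
* signature and that the vnode interlock doubles as the bufobj lock here;
* it would replace the hand-rolled wait loop:
*
*	(void) bufobj_wwait(&vp->v_bufobj, 0, 0);
*/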
6274
6275 /*
6276 * Called whenever a buffer that is being invalidated or reallocated
6277 * contains dependencies. This should only happen if an I/O error has
6278 * occurred. The routine is called with the buffer locked.
6279 */
6280 static void
6281 softdep_deallocate_dependencies(bp)
6282 struct buf *bp;
6283 {
6284
6285 if ((bp->b_ioflags & BIO_ERROR) == 0)
6286 panic("softdep_deallocate_dependencies: dangling deps");
6287 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
6288 panic("softdep_deallocate_dependencies: unrecovered I/O error");
6289 }
6290
6291 /*
6292 * Function to handle asynchronous write errors in the filesystem.
6293 */
6294 static void
6295 softdep_error(func, error)
6296 char *func;
6297 int error;
6298 {
6299
6300 /* XXX should do something better! */
6301 printf("%s: got error %d while accessing filesystem\n", func, error);
6302 }
6303
6304 #endif /* SOFTUPDATES */