FreeBSD/Linux Kernel Cross Reference
sys/fs/buffer.c
1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
23 /* Added 32k buffer block sizes - these are required on older ARM systems.
24 * - RMK
25 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
50
51 #include <asm/uaccess.h>
52 #include <asm/io.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
55
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20) /* don't ever have more than this
58 number of unused buffer heads */
59
60 /* Anti-deadlock ordering:
61 * lru_list_lock > hash_table_lock > unused_list_lock
62 */
63
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65
66 /*
67 * Hash table gook..
68 */
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73
74 static struct buffer_head *lru_list[NR_LIST];
75
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock lru_list_lock_cacheline.lock
78
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
81
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86
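/*
 * Illustrative sketch (not part of the original file): a path that
 * needs more than one of these locks must honor the documented
 * anti-deadlock order above - lru_list_lock outermost, then
 * hash_table_lock, then unused_list_lock - and release in reverse.
 * remove_from_queues() below follows exactly this pattern.
 */
static inline void example_locked_queue_op(void)
{
	spin_lock(&lru_list_lock);	/* outermost: lru_list_lock */
	write_lock(&hash_table_lock);	/* nested: hash_table_lock */
	/* ... touch hash chains and lru lists here ... */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
}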
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
90
91 /* This is used by some architectures to estimate available memory. */
92 atomic_t buffermem_pages = ATOMIC_INIT(0);
93
94 /* Here is the parameter block for the bdflush process. If you add or
95 * remove any of the parameters, make sure to update kernel/sysctl.c
96 * and the documentation at linux/Documentation/sysctl/vm.txt.
97 */
98
99 #define N_PARAM 9
100
101 /* The dummy values in this structure are left in there for compatibility
102 * with old programs that play with the /proc entries.
103 */
104 union bdflush_param {
105 struct {
106 int nfract; /* Percentage of buffer cache dirty to
107 activate bdflush */
108 int ndirty; /* Maximum number of dirty blocks to write out per
109 wake-cycle */
110 int dummy2; /* old "nrefill" */
111 int dummy3; /* unused */
112 int interval; /* jiffies delay between kupdate flushes */
113 int age_buffer; /* Time for normal buffer to age before we flush it */
114 int nfract_sync;/* Percentage of buffer cache dirty to
115 activate bdflush synchronously */
116 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
117 int dummy5; /* unused */
118 } b_un;
119 unsigned int data[N_PARAM];
120 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
121
122 /* These are the min and max parameter values that we will allow to be assigned */
123 int bdflush_min[N_PARAM] = { 0, 1, 0, 0, 0, 1*HZ, 0, 0, 0};
124 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
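/*
 * Sketch (not from this file) of how one of these tunables is
 * expected to be updated: the bdflush(2) syscall clamps a new value
 * against bdflush_min/bdflush_max before storing it in bdf_prm.data.
 * set_bdflush_param() is a hypothetical name used for illustration.
 */
static int set_bdflush_param(unsigned int idx, int value)
{
	if (idx >= N_PARAM)
		return -EINVAL;
	if (value < bdflush_min[idx] || value > bdflush_max[idx])
		return -EINVAL;
	bdf_prm.data[idx] = value;	/* aliases bdf_prm.b_un via the union */
	return 0;
}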
125
126 void unlock_buffer(struct buffer_head *bh)
127 {
128 clear_bit(BH_Wait_IO, &bh->b_state);
129 clear_bit(BH_Launder, &bh->b_state);
130 /*
131 * When a locked buffer is visible to the I/O layer BH_Launder
132 * is set. This means that before unlocking we must clear BH_Launder,
133 * issue an mb() (needed on alpha) and then clear BH_Lock, so no reader
134 * can see BH_Launder set on an unlocked buffer and risk a deadlock.
135 */
136 smp_mb__after_clear_bit();
137 clear_bit(BH_Lock, &bh->b_state);
138 smp_mb__after_clear_bit();
139 if (waitqueue_active(&bh->b_wait))
140 wake_up(&bh->b_wait);
141 }
142
143 /*
144 * Note that the real wait_on_buffer() is an inline function that checks
145 * that the buffer is locked before calling this, so that unnecessary disk
146 * unplugging does not occur.
147 */
148 void __wait_on_buffer(struct buffer_head * bh)
149 {
150 struct task_struct *tsk = current;
151 DECLARE_WAITQUEUE(wait, tsk);
152
153 get_bh(bh);
154 add_wait_queue(&bh->b_wait, &wait);
155 do {
156 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
157 if (!buffer_locked(bh))
158 break;
159 /*
160 * We must read tq_disk in TQ_ACTIVE after the
161 * add_wait_queue effect is visible to other cpus.
162 * We could unplug some lines above and it wouldn't matter,
163 * but we can't do that right after add_wait_queue
164 * without an smp_mb() in between, because spin_unlock
165 * has inclusive semantics.
166 * Doing it here is the most efficient place, so we
167 * don't do a spurious unplug if we get a racy
168 * wakeup that makes buffer_locked return 0; and
169 * doing it here avoids an explicit smp_mb() - we
170 * rely on the implicit one in set_task_state.
171 */
172 run_task_queue(&tq_disk);
173 schedule();
174 } while (buffer_locked(bh));
175 tsk->state = TASK_RUNNING;
176 remove_wait_queue(&bh->b_wait, &wait);
177 put_bh(bh);
178 }
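/*
 * For reference, the inline wrapper mentioned in the comment above
 * looks roughly like this (it lives in <linux/locks.h>, not in this
 * file; reproduced here as a sketch):
 *
 *	static inline void wait_on_buffer(struct buffer_head *bh)
 *	{
 *		if (test_bit(BH_Lock, &bh->b_state))
 *			__wait_on_buffer(bh);
 *	}
 *
 * so the slow path above runs only when the buffer really is locked.
 */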
179
180 /*
181 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
182 * unlock the buffer. This is what ll_rw_block uses too.
183 */
184 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
185 {
186 mark_buffer_uptodate(bh, uptodate);
187 unlock_buffer(bh);
188 put_bh(bh);
189 }
190
191 /*
192 * The buffers have been marked clean and locked. Just submit the dang
193 * things..
194 */
195 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
196 {
197 do {
198 struct buffer_head * bh = *array++;
199 bh->b_end_io = end_buffer_io_sync;
200 submit_bh(WRITE, bh);
201 } while (--count);
202 }
203
204 /*
205 * Write some buffers from the head of the dirty queue.
206 *
207 * This must be called with the LRU lock held, and will
208 * return without it!
209 */
210 #define NRSYNC (32)
211 static int write_some_buffers(kdev_t dev)
212 {
213 struct buffer_head *next;
214 struct buffer_head *array[NRSYNC];
215 unsigned int count;
216 int nr;
217
218 next = lru_list[BUF_DIRTY];
219 nr = nr_buffers_type[BUF_DIRTY];
220 count = 0;
221 while (next && --nr >= 0) {
222 struct buffer_head * bh = next;
223 next = bh->b_next_free;
224
225 if (dev != NODEV && bh->b_dev != dev)
226 continue;
227 if (test_and_set_bit(BH_Lock, &bh->b_state))
228 continue;
229 if (atomic_set_buffer_clean(bh)) {
230 __refile_buffer(bh);
231 get_bh(bh);
232 array[count++] = bh;
233 if (count < NRSYNC)
234 continue;
235
236 spin_unlock(&lru_list_lock);
237 write_locked_buffers(array, count);
238 return -EAGAIN;
239 }
240 unlock_buffer(bh);
241 __refile_buffer(bh);
242 }
243 spin_unlock(&lru_list_lock);
244
245 if (count)
246 write_locked_buffers(array, count);
247 return 0;
248 }
249
250 /*
251 * Write out all buffers on the dirty list.
252 */
253 static void write_unlocked_buffers(kdev_t dev)
254 {
255 do {
256 spin_lock(&lru_list_lock);
257 } while (write_some_buffers(dev));
258 }
259
260 /*
261 * Wait for a buffer on the proper list.
262 *
263 * This must be called with the LRU lock held, and
264 * will return with it released.
265 */
266 static int wait_for_buffers(kdev_t dev, int index, int refile)
267 {
268 struct buffer_head * next;
269 int nr;
270
271 next = lru_list[index];
272 nr = nr_buffers_type[index];
273 while (next && --nr >= 0) {
274 struct buffer_head *bh = next;
275 next = bh->b_next_free;
276
277 if (!buffer_locked(bh)) {
278 if (refile)
279 __refile_buffer(bh);
280 continue;
281 }
282 if (dev != NODEV && bh->b_dev != dev)
283 continue;
284
285 get_bh(bh);
286 spin_unlock(&lru_list_lock);
287 wait_on_buffer (bh);
288 put_bh(bh);
289 return -EAGAIN;
290 }
291 spin_unlock(&lru_list_lock);
292 return 0;
293 }
294
295 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
296 {
297 do {
298 spin_lock(&lru_list_lock);
299 } while (wait_for_buffers(dev, index, refile));
300 return 0;
301 }
302
303 /* Call sync_buffers with wait!=0 to ensure that the call does not
304 * return until all buffer writes have completed. Sync() may return
305 * before the writes have finished; fsync() may not.
306 */
307
308 /* Godamity-damn. Some buffers (bitmaps for filesystems)
309 * spontaneously dirty themselves without ever brelse being called.
310 * We will ultimately want to put these in a separate list, but for
311 * now we search all of the lists for dirty buffers.
312 */
313 int sync_buffers(kdev_t dev, int wait)
314 {
315 int err = 0;
316
317 /* One pass for no-wait, three for wait:
318 * 0) write out all dirty, unlocked buffers;
319 * 1) wait for all dirty locked buffers;
320 * 2) write out all dirty, unlocked buffers;
321 * 3) wait for completion by waiting for all buffers to unlock.
322 */
323 write_unlocked_buffers(dev);
324 if (wait) {
325 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
326 write_unlocked_buffers(dev);
327 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
328 }
329 return err;
330 }
331
332 int fsync_super(struct super_block *sb)
333 {
334 kdev_t dev = sb->s_dev;
335 sync_buffers(dev, 0);
336
337 lock_kernel();
338 sync_inodes_sb(sb);
339 DQUOT_SYNC_SB(sb);
340 lock_super(sb);
341 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
343 unlock_super(sb);
344 if (sb->s_op && sb->s_op->sync_fs)
345 sb->s_op->sync_fs(sb);
346 unlock_kernel();
347
348 return sync_buffers(dev, 1);
349 }
350
351 int fsync_no_super(kdev_t dev)
352 {
353 sync_buffers(dev, 0);
354 return sync_buffers(dev, 1);
355 }
356
357 int fsync_dev(kdev_t dev)
358 {
359 sync_buffers(dev, 0);
360
361 lock_kernel();
362 sync_inodes(dev);
363 DQUOT_SYNC_DEV(dev);
364 sync_supers(dev, 1);
365 unlock_kernel();
366
367 return sync_buffers(dev, 1);
368 }
369
370 /*
371 * There's no real reason to pretend we should
372 * ever do anything differently
373 */
374 void sync_dev(kdev_t dev)
375 {
376 fsync_dev(dev);
377 }
378
379 asmlinkage long sys_sync(void)
380 {
381 fsync_dev(0);
382 return 0;
383 }
384
385 /*
386 * filp may be NULL if called via the msync of a vma.
387 */
388
389 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
390 {
391 struct inode * inode = dentry->d_inode;
392 struct super_block * sb;
393 kdev_t dev;
394 int ret;
395
396 lock_kernel();
397 /* sync the inode to buffers */
398 write_inode_now(inode, 0);
399
400 /* sync the superblock to buffers */
401 sb = inode->i_sb;
402 lock_super(sb);
403 if (sb->s_op && sb->s_op->write_super)
404 sb->s_op->write_super(sb);
405 unlock_super(sb);
406
407 /* .. finally sync the buffers to disk */
408 dev = inode->i_dev;
409 ret = sync_buffers(dev, 1);
410 unlock_kernel();
411 return ret;
412 }
413
414 asmlinkage long sys_fsync(unsigned int fd)
415 {
416 struct file * file;
417 struct dentry * dentry;
418 struct inode * inode;
419 int ret, err;
420
421 ret = -EBADF;
422 file = fget(fd);
423 if (!file)
424 goto out;
425
426 dentry = file->f_dentry;
427 inode = dentry->d_inode;
428
429 ret = -EINVAL;
430 if (!file->f_op || !file->f_op->fsync) {
431 /* Why? We can still call filemap_fdatasync */
432 goto out_putf;
433 }
434
435 /* We need to protect against concurrent writers.. */
436 down(&inode->i_sem);
437 ret = filemap_fdatasync(inode->i_mapping);
438 err = file->f_op->fsync(file, dentry, 0);
439 if (err && !ret)
440 ret = err;
441 err = filemap_fdatawait(inode->i_mapping);
442 if (err && !ret)
443 ret = err;
444 up(&inode->i_sem);
445
446 out_putf:
447 fput(file);
448 out:
449 return ret;
450 }
451
452 int do_fdatasync(struct file *file)
453 {
454 int ret, err;
455 struct dentry *dentry;
456 struct inode *inode;
457
458 if (unlikely(!file->f_op || !file->f_op->fsync))
459 return -EINVAL;
460
461 dentry = file->f_dentry;
462 inode = dentry->d_inode;
463
464 ret = filemap_fdatasync(inode->i_mapping);
465 err = file->f_op->fsync(file, dentry, 1);
466 if (err && !ret)
467 ret = err;
468 err = filemap_fdatawait(inode->i_mapping);
469 if (err && !ret)
470 ret = err;
471 return ret;
472 }
473
474 asmlinkage long sys_fdatasync(unsigned int fd)
475 {
476 struct file * file;
477 struct inode *inode;
478 int ret;
479
480 ret = -EBADF;
481 file = fget(fd);
482 if (!file)
483 goto out;
484
485 inode = file->f_dentry->d_inode;
486 down(&inode->i_sem);
487 ret = do_fdatasync(file);
488 up(&inode->i_sem);
489
490 fput(file);
491 out:
492 return ret;
493 }
494
495 /* After several hours of tedious analysis, the following hash
496 * function won. Do not mess with it... -DaveM
497 */
498 #define _hashfn(dev,block) \
499 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
500 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
501 ((block) << (bh_hash_shift - 12))))
502 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
503
504 static inline void __insert_into_hash_list(struct buffer_head *bh)
505 {
506 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
507 struct buffer_head *next = *head;
508
509 *head = bh;
510 bh->b_pprev = head;
511 bh->b_next = next;
512 if (next != NULL)
513 next->b_pprev = &bh->b_next;
514 }
515
516 static __inline__ void __hash_unlink(struct buffer_head *bh)
517 {
518 struct buffer_head **pprev = bh->b_pprev;
519 if (pprev) {
520 struct buffer_head *next = bh->b_next;
521 if (next)
522 next->b_pprev = pprev;
523 *pprev = next;
524 bh->b_pprev = NULL;
525 }
526 }
527
528 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
529 {
530 struct buffer_head **bhp = &lru_list[blist];
531
532 if (bh->b_prev_free || bh->b_next_free) BUG();
533
534 if(!*bhp) {
535 *bhp = bh;
536 bh->b_prev_free = bh;
537 }
538 bh->b_next_free = *bhp;
539 bh->b_prev_free = (*bhp)->b_prev_free;
540 (*bhp)->b_prev_free->b_next_free = bh;
541 (*bhp)->b_prev_free = bh;
542 nr_buffers_type[blist]++;
543 size_buffers_type[blist] += bh->b_size;
544 }
545
546 static void __remove_from_lru_list(struct buffer_head * bh)
547 {
548 struct buffer_head *next = bh->b_next_free;
549 if (next) {
550 struct buffer_head *prev = bh->b_prev_free;
551 int blist = bh->b_list;
552
553 prev->b_next_free = next;
554 next->b_prev_free = prev;
555 if (lru_list[blist] == bh) {
556 if (next == bh)
557 next = NULL;
558 lru_list[blist] = next;
559 }
560 bh->b_next_free = NULL;
561 bh->b_prev_free = NULL;
562 nr_buffers_type[blist]--;
563 size_buffers_type[blist] -= bh->b_size;
564 }
565 }
566
567 /* must be called with both the hash_table_lock and the lru_list_lock
568 held */
569 static void __remove_from_queues(struct buffer_head *bh)
570 {
571 __hash_unlink(bh);
572 __remove_from_lru_list(bh);
573 }
574
575 static void remove_from_queues(struct buffer_head *bh)
576 {
577 spin_lock(&lru_list_lock);
578 write_lock(&hash_table_lock);
579 __remove_from_queues(bh);
580 write_unlock(&hash_table_lock);
581 spin_unlock(&lru_list_lock);
582 }
583
584 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
585 {
586 struct buffer_head *bh, **p = &hash(dev, block);
587
588 read_lock(&hash_table_lock);
589
590 for (;;) {
591 bh = *p;
592 if (!bh)
593 break;
594 p = &bh->b_next;
595 if (bh->b_blocknr != block)
596 continue;
597 if (bh->b_size != size)
598 continue;
599 if (bh->b_dev != dev)
600 continue;
601 get_bh(bh);
602 break;
603 }
604
605 read_unlock(&hash_table_lock);
606 return bh;
607 }
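/*
 * Illustrative usage (sketch, not part of the original file):
 * get_hash_table() returns the buffer with its reference count
 * already raised by get_bh(), so every successful lookup must be
 * paired with a brelse().
 */
static void example_peek_cached_block(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = get_hash_table(dev, block, size);

	if (bh) {
		/* ... inspect bh->b_state, bh->b_data, etc. ... */
		brelse(bh);	/* drop the reference from the lookup */
	}
}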
608
609 void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
610 {
611 spin_lock(&lru_list_lock);
612 if (buffer_attached(bh))
613 list_del(&bh->b_inode_buffers);
614 set_buffer_attached(bh);
615 list_add(&bh->b_inode_buffers, list);
616 spin_unlock(&lru_list_lock);
617 }
618
619 /*
620 * The caller must have the lru_list lock before calling the
621 * remove_inode_queue functions.
622 */
623 static void __remove_inode_queue(struct buffer_head *bh)
624 {
625 list_del(&bh->b_inode_buffers);
626 clear_buffer_attached(bh);
627 }
628
629 static inline void remove_inode_queue(struct buffer_head *bh)
630 {
631 if (buffer_attached(bh))
632 __remove_inode_queue(bh);
633 }
634
635 int inode_has_buffers(struct inode *inode)
636 {
637 int ret;
638
639 spin_lock(&lru_list_lock);
640 ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
641 spin_unlock(&lru_list_lock);
642
643 return ret;
644 }
645
646 /* If invalidate_buffers() trashes dirty buffers, it means some kind
647 of fs corruption is going on. Trashing dirty data always implies losing
648 information that was supposed to be just stored on the physical layer
649 by the user.
650
651 Thus invalidate_buffers in general usage is not allowed to trash
652 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
653 be preserved. Such buffers are simply skipped.
654
655 We also skip buffers which are still in use. For example this can
656 happen if a userspace program is reading the block device.
657
658 NOTE: if the user removed a removable-media disk while there was
659 still dirty data not synced to disk (due to a bug in the device driver
660 or to an error by the user), then by not destroying the dirty buffers
661 we could generate corruption also on the next media inserted. Thus a
662 parameter is necessary to handle this case in the safest way possible
663 (trying not to corrupt the newly inserted disk with data belonging to
664 the old, now corrupted, disk). Also, for the ramdisk the natural way
665 to release the ramdisk memory is to destroy its dirty buffers.
666
667 These are two special cases. Normal usage implies that the device
668 driver issues a sync on the device (without waiting for I/O completion)
669 and then an invalidate_buffers call that doesn't trash dirty buffers.
670
671 For handling cache coherency with the blkdev pagecache, the 'update'
672 case has been introduced. It is needed to re-read from disk any pinned
673 buffer. NOTE: re-reading from disk is destructive, so we can do it only
674 when we assume nobody is changing the buffercache under our I/O and when
675 we think the disk contains more recent information than the buffercache.
676 The update == 1 pass marks the buffers we need to update; the update == 2
677 pass does the actual I/O. */
678 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
679 {
680 int i, nlist, slept;
681 struct buffer_head * bh, * bh_next;
682 kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */
683
684 retry:
685 slept = 0;
686 spin_lock(&lru_list_lock);
687 for(nlist = 0; nlist < NR_LIST; nlist++) {
688 bh = lru_list[nlist];
689 if (!bh)
690 continue;
691 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
692 bh_next = bh->b_next_free;
693
694 /* Another device? */
695 if (bh->b_dev != dev)
696 continue;
697 /* Not hashed? */
698 if (!bh->b_pprev)
699 continue;
700 if (buffer_locked(bh)) {
701 get_bh(bh);
702 spin_unlock(&lru_list_lock);
703 wait_on_buffer(bh);
704 slept = 1;
705 spin_lock(&lru_list_lock);
706 put_bh(bh);
707 }
708
709 write_lock(&hash_table_lock);
710 /* All buffers in the lru lists are mapped */
711 if (!buffer_mapped(bh))
712 BUG();
713 if (buffer_dirty(bh) && destroy_dirty_buffers)
714 printk("invalidate: dirty buffer\n");
715 if (!atomic_read(&bh->b_count)) {
716 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
717 remove_inode_queue(bh);
718 }
719 } else if (!bdev->bd_openers)
720 printk("invalidate: busy buffer\n");
721
722 write_unlock(&hash_table_lock);
723 if (slept)
724 goto out;
725 }
726 }
727 out:
728 spin_unlock(&lru_list_lock);
729 if (slept)
730 goto retry;
731
732 /* Get rid of the page cache */
733 invalidate_inode_pages(bdev->bd_inode);
734 }
735
736 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
737 {
738 struct block_device *bdev = bdget(dev);
739 if (bdev) {
740 invalidate_bdev(bdev, destroy_dirty_buffers);
741 bdput(bdev);
742 }
743 }
744
745 static void free_more_memory(void)
746 {
747 balance_dirty();
748 wakeup_bdflush();
749 try_to_free_pages(GFP_NOIO);
750 run_task_queue(&tq_disk);
751 yield();
752 }
753
754 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
755 {
756 bh->b_list = BUF_CLEAN;
757 bh->b_end_io = handler;
758 bh->b_private = private;
759 }
760
761 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
762 {
763 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
764 unsigned long flags;
765 struct buffer_head *tmp;
766 struct page *page;
767 int fullup = 1;
768
769 mark_buffer_uptodate(bh, uptodate);
770
771 /* This is a temporary buffer used for page I/O. */
772 page = bh->b_page;
773
774 if (!uptodate)
775 SetPageError(page);
776
777 /*
778 * Be _very_ careful from here on. Bad things can happen if
779 * two buffer heads end IO at almost the same time and both
780 * decide that the page is now completely done.
781 *
782 * Async buffer_heads are here only as labels for IO, and get
783 * thrown away once the IO for this page is complete. IO is
784 * deemed complete once all buffers have been visited
785 * (b_count==0) and are now unlocked. We must make sure that
786 * only the _last_ buffer that decrements its count is the one
787 * that unlock the page..
788 */
789 spin_lock_irqsave(&page_uptodate_lock, flags);
790 mark_buffer_async(bh, 0);
791 unlock_buffer(bh);
792 tmp = bh->b_this_page;
793 while (tmp != bh) {
794 if (buffer_locked(tmp)) {
795 if (buffer_async(tmp))
796 goto still_busy;
797 } else if (!buffer_uptodate(tmp))
798 fullup = 0;
799 tmp = tmp->b_this_page;
800 }
801
802 /* OK, the async IO on this page is complete. */
803 spin_unlock_irqrestore(&page_uptodate_lock, flags);
804
805 /*
806 * If none of the buffers had errors and all were uptodate
807 * then we can set the page uptodate:
808 */
809 if (fullup && !PageError(page))
810 SetPageUptodate(page);
811
812 UnlockPage(page);
813
814 return;
815
816 still_busy:
817 spin_unlock_irqrestore(&page_uptodate_lock, flags);
818 return;
819 }
820
821 inline void set_buffer_async_io(struct buffer_head *bh)
822 {
823 bh->b_end_io = end_buffer_io_async;
824 mark_buffer_async(bh, 1);
825 }
826
827 /*
828 * Synchronise all the inode's dirty buffers to the disk.
829 *
830 * We have conflicting pressures: we want to make sure that all
831 * initially dirty buffers get waited on, but that any subsequently
832 * dirtied buffers don't. After all, we don't want fsync to last
833 * forever if somebody is actively writing to the file.
834 *
835 * Do this in two main stages: first we copy dirty buffers to a
836 * temporary inode list, queueing the writes as we go. Then we clean
837 * up, waiting for those writes to complete.
838 *
839 * During this second stage, any subsequent updates to the file may end
840 * up refiling the buffer on the original inode's dirty list again, so
841 * there is a chance we will end up with a buffer queued for write but
842 * not yet completed on that list. So, as a final cleanup we go through
843 * the osync code to catch these locked, dirty buffers without requeuing
844 * any newly dirty buffers for write.
845 */
846 int fsync_buffers_list(struct list_head *list)
847 {
848 struct buffer_head *bh;
849 struct list_head tmp;
850 int err = 0, err2;
851
852 INIT_LIST_HEAD(&tmp);
853
854 spin_lock(&lru_list_lock);
855
856 while (!list_empty(list)) {
857 bh = BH_ENTRY(list->next);
858 list_del(&bh->b_inode_buffers);
859 if (!buffer_dirty(bh) && !buffer_locked(bh))
860 clear_buffer_attached(bh);
861 else {
862 set_buffer_attached(bh);
863 list_add(&bh->b_inode_buffers, &tmp);
864 if (buffer_dirty(bh)) {
865 get_bh(bh);
866 spin_unlock(&lru_list_lock);
867 /*
868 * Wait for I/O completion before submitting
869 * the buffer, to be sure the write will
870 * be effective on the latest data in
871 * the buffer. (otherwise - if there's old
872 * I/O in flight - write_buffer would become
873 * a noop)
874 */
875 wait_on_buffer(bh);
876 ll_rw_block(WRITE, 1, &bh);
877 brelse(bh);
878 spin_lock(&lru_list_lock);
879 }
880 }
881 }
882
883 while (!list_empty(&tmp)) {
884 bh = BH_ENTRY(tmp.prev);
885 remove_inode_queue(bh);
886 get_bh(bh);
887 spin_unlock(&lru_list_lock);
888 wait_on_buffer(bh);
889 if (!buffer_uptodate(bh))
890 err = -EIO;
891 brelse(bh);
892 spin_lock(&lru_list_lock);
893 }
894
895 spin_unlock(&lru_list_lock);
896 err2 = osync_buffers_list(list);
897
898 if (err)
899 return err;
900 else
901 return err2;
902 }
903
904 /*
905 * osync is designed to support O_SYNC io. It waits synchronously for
906 * all already-submitted IO to complete, but does not queue any new
907 * writes to the disk.
908 *
909 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
910 * you dirty the buffers, and then use osync_buffers_list to wait for
911 * completion. Any other dirty buffers which are not yet queued for
912 * write will not be flushed to disk by the osync.
913 */
914 static int osync_buffers_list(struct list_head *list)
915 {
916 struct buffer_head *bh;
917 struct list_head *p;
918 int err = 0;
919
920 spin_lock(&lru_list_lock);
921
922 repeat:
923 list_for_each_prev(p, list) {
924 bh = BH_ENTRY(p);
925 if (buffer_locked(bh)) {
926 get_bh(bh);
927 spin_unlock(&lru_list_lock);
928 wait_on_buffer(bh);
929 if (!buffer_uptodate(bh))
930 err = -EIO;
931 brelse(bh);
932 spin_lock(&lru_list_lock);
933 goto repeat;
934 }
935 }
936
937 spin_unlock(&lru_list_lock);
938 return err;
939 }
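/*
 * Sketch of the O_SYNC recipe from the comment above (illustrative,
 * not part of the original file): queue each write with ll_rw_block()
 * as the buffer is dirtied, then wait on the inode's dirty list.
 */
static int example_osync_write(struct inode *inode, struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);	/* queue the write immediately */
	/* ... more writes may be queued the same way ... */
	return osync_buffers_list(&inode->i_dirty_buffers);
}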
940
941 /*
942 * Invalidate any and all dirty buffers on a given inode. We are
943 * probably unmounting the fs, but that doesn't mean we have already
944 * done a sync(). Just drop the buffers from the inode list.
945 */
946 void invalidate_inode_buffers(struct inode *inode)
947 {
948 struct list_head * entry;
949
950 spin_lock(&lru_list_lock);
951 while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
952 remove_inode_queue(BH_ENTRY(entry));
953 while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
954 remove_inode_queue(BH_ENTRY(entry));
955 spin_unlock(&lru_list_lock);
956 }
957
958
959 /*
960 * Ok, this is getblk, and it isn't very clear, again to hinder
961 * race-conditions. Most of the code is seldom used (ie repeating),
962 * so it should be much more efficient than it looks.
963 *
964 * The algorithm is changed: hopefully better, and an elusive bug removed.
965 *
966 * 14.02.92: changed it to sync dirty buffers a bit: better performance
967 * when the filesystem starts to get full of dirty blocks (I hope).
968 */
969 struct buffer_head * getblk(kdev_t dev, int block, int size)
970 {
971 for (;;) {
972 struct buffer_head * bh;
973
974 bh = get_hash_table(dev, block, size);
975 if (bh) {
976 touch_buffer(bh);
977 return bh;
978 }
979
980 if (!grow_buffers(dev, block, size))
981 free_more_memory();
982 }
983 }
984
985 /* -1 -> no need to flush
986 0 -> async flush
987 1 -> sync flush (wait for I/O completion) */
988 static int balance_dirty_state(void)
989 {
990 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
991
992 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
993 tot = nr_free_buffer_pages();
994
995 dirty *= 100;
996 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
997 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
998
999 /* First, check for the "real" dirty limit. */
1000 if (dirty > soft_dirty_limit) {
1001 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1002 return 1;
1003 return 0;
1004 }
1005
1006 return -1;
1007 }
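/*
 * Worked example of the arithmetic above (illustrative numbers): with
 * tot = 1000 free buffer pages and the default nfract = 30 and
 * nfract_sync = 60, "dirty * 100 > tot * nfract" means an async flush
 * is requested once more than 300 pages are dirty, and the caller is
 * forced to flush synchronously once more than 600 pages are dirty.
 */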
1008
1009 static int bdflush_stop(void)
1010 {
1011 unsigned long dirty, tot, dirty_limit;
1012
1013 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1014 tot = nr_free_buffer_pages();
1015
1016 dirty *= 100;
1017 dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1018
1019 if (dirty > dirty_limit)
1020 return 0;
1021 return 1;
1022 }
1023
1024 /*
1025 * if a new dirty buffer is created we need to balance bdflush.
1026 *
1027 * in the future we might want to make bdflush aware of different
1028 * pressures on different devices - thus the (currently unused)
1029 * 'dev' parameter.
1030 */
1031 void balance_dirty(void)
1032 {
1033 int state = balance_dirty_state();
1034
1035 if (state < 0)
1036 return;
1037
1038 wakeup_bdflush();
1039
1040 /*
1041 * And if we're _really_ out of balance, wait for
1042 * some of the dirty/locked buffers ourselves.
1043 * This will throttle heavy writers.
1044 */
1045 if (state > 0) {
1046 spin_lock(&lru_list_lock);
1047 write_some_buffers(NODEV);
1048 }
1049 }
1050 EXPORT_SYMBOL(balance_dirty);
1051
1052 inline void __mark_dirty(struct buffer_head *bh)
1053 {
1054 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1055 refile_buffer(bh);
1056 }
1057
1058 /* atomic version, the user must call balance_dirty() by hand
1059 as soon as it becomes possible to block */
1060 void __mark_buffer_dirty(struct buffer_head *bh)
1061 {
1062 if (!atomic_set_buffer_dirty(bh))
1063 __mark_dirty(bh);
1064 }
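/*
 * Illustrative calling pattern for the atomic version (sketch): dirty
 * the buffer while blocking is impossible, then throttle afterwards.
 */
static void example_dirty_then_balance(struct buffer_head *bh)
{
	__mark_buffer_dirty(bh);	/* atomic: never blocks */
	/* ... leave the atomic context ... */
	balance_dirty();		/* may block to throttle the writer */
}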
1065
1066 void mark_buffer_dirty(struct buffer_head *bh)
1067 {
1068 if (!atomic_set_buffer_dirty(bh)) {
1069 __mark_dirty(bh);
1070 balance_dirty();
1071 }
1072 }
1073
1074 void set_buffer_flushtime(struct buffer_head *bh)
1075 {
1076 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1077 }
1078 EXPORT_SYMBOL(set_buffer_flushtime);
1079
1080 /*
1081 * A buffer may need to be moved from one buffer list to another
1082 * (e.g. in case it is not shared any more). Handle this.
1083 */
1084 static void __refile_buffer(struct buffer_head *bh)
1085 {
1086 int dispose = BUF_CLEAN;
1087 if (buffer_locked(bh))
1088 dispose = BUF_LOCKED;
1089 if (buffer_dirty(bh))
1090 dispose = BUF_DIRTY;
1091 if (dispose != bh->b_list) {
1092 __remove_from_lru_list(bh);
1093 bh->b_list = dispose;
1094 if (dispose == BUF_CLEAN)
1095 remove_inode_queue(bh);
1096 __insert_into_lru_list(bh, dispose);
1097 }
1098 }
1099
1100 void refile_buffer(struct buffer_head *bh)
1101 {
1102 spin_lock(&lru_list_lock);
1103 __refile_buffer(bh);
1104 spin_unlock(&lru_list_lock);
1105 }
1106
1107 /*
1108 * Release a buffer head
1109 */
1110 void __brelse(struct buffer_head * buf)
1111 {
1112 if (atomic_read(&buf->b_count)) {
1113 put_bh(buf);
1114 return;
1115 }
1116 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1117 }
1118
1119 /*
1120 * bforget() is like brelse(), except it discards any
1121 * potentially dirty data.
1122 */
1123 void __bforget(struct buffer_head * buf)
1124 {
1125 mark_buffer_clean(buf);
1126 __brelse(buf);
1127 }
1128
1129 /**
1130 * bread() - reads a specified block and returns the bh
1131 * @block: number of block
1132 * @size: size (in bytes) to read
1133 *
1134 * Reads a specified block, and returns buffer head that
1135 * contains it. It returns NULL if the block was unreadable.
1136 */
1137 struct buffer_head * bread(kdev_t dev, int block, int size)
1138 {
1139 struct buffer_head * bh;
1140
1141 bh = getblk(dev, block, size);
1142 if (buffer_uptodate(bh))
1143 return bh;
1144 set_bit(BH_Sync, &bh->b_state);
1145 ll_rw_block(READ, 1, &bh);
1146 wait_on_buffer(bh);
1147 if (buffer_uptodate(bh))
1148 return bh;
1149 brelse(bh);
1150 return NULL;
1151 }
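/*
 * Illustrative usage (sketch): a filesystem reading one metadata
 * block. The function name is hypothetical; the pattern is
 * bread() + brelse().
 */
static int example_read_metadata(kdev_t dev, int blocknr, int blocksize)
{
	struct buffer_head *bh = bread(dev, blocknr, blocksize);

	if (!bh)
		return -EIO;	/* the block was unreadable */
	/* ... parse bh->b_data ... */
	brelse(bh);
	return 0;
}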
1152
1153 /*
1154 * Note: the caller should wake up the buffer_wait list if needed.
1155 */
1156 static void __put_unused_buffer_head(struct buffer_head * bh)
1157 {
1158 if (unlikely(buffer_attached(bh)))
1159 BUG();
1160 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1161 kmem_cache_free(bh_cachep, bh);
1162 } else {
1163 bh->b_dev = B_FREE;
1164 bh->b_blocknr = -1;
1165 bh->b_this_page = NULL;
1166
1167 nr_unused_buffer_heads++;
1168 bh->b_next_free = unused_list;
1169 unused_list = bh;
1170 }
1171 }
1172
1173 void put_unused_buffer_head(struct buffer_head *bh)
1174 {
1175 spin_lock(&unused_list_lock);
1176 __put_unused_buffer_head(bh);
1177 spin_unlock(&unused_list_lock);
1178 }
1179 EXPORT_SYMBOL(put_unused_buffer_head);
1180
1181 /*
1182 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1183 * no-buffer-head deadlock. Return NULL on failure; waiting for
1184 * buffer heads is now handled in create_buffers().
1185 */
1186 struct buffer_head * get_unused_buffer_head(int async)
1187 {
1188 struct buffer_head * bh;
1189
1190 spin_lock(&unused_list_lock);
1191 if (nr_unused_buffer_heads > NR_RESERVED) {
1192 bh = unused_list;
1193 unused_list = bh->b_next_free;
1194 nr_unused_buffer_heads--;
1195 spin_unlock(&unused_list_lock);
1196 return bh;
1197 }
1198 spin_unlock(&unused_list_lock);
1199
1200 /* This is critical. We can't call out to the FS
1201 * to get more buffer heads, because the FS may need
1202 * more buffer-heads itself. Thus SLAB_NOFS.
1203 */
1204 if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1205 bh->b_blocknr = -1;
1206 bh->b_this_page = NULL;
1207 return bh;
1208 }
1209
1210 /*
1211 * If we need an async buffer, use the reserved buffer heads.
1212 */
1213 if (async) {
1214 spin_lock(&unused_list_lock);
1215 if (unused_list) {
1216 bh = unused_list;
1217 unused_list = bh->b_next_free;
1218 nr_unused_buffer_heads--;
1219 spin_unlock(&unused_list_lock);
1220 return bh;
1221 }
1222 spin_unlock(&unused_list_lock);
1223 }
1224
1225 return NULL;
1226 }
1227 EXPORT_SYMBOL(get_unused_buffer_head);
1228
1229 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1230 {
1231 if (offset >= PAGE_SIZE)
1232 BUG();
1233
1234 if (PageHighMem(page)) {
1235 bh->b_data = (char *)offset;
1236 } else {
1237 bh->b_data = page_address(page) + offset;
1238 }
1239 bh->b_page = page;
1240 }
1241 EXPORT_SYMBOL(set_bh_page);
1242
1243 /*
1244 * Create the appropriate buffers when given a page for data area and
1245 * the size of each buffer.. Use the bh->b_this_page linked list to
1246 * follow the buffers created. Return NULL if unable to create more
1247 * buffers.
1248 * The async flag is used to differentiate async IO (paging, swapping)
1249 * from ordinary buffer allocations, and only async requests are allowed
1250 * to sleep waiting for buffer heads.
1251 */
1252 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1253 {
1254 struct buffer_head *bh, *head;
1255 long offset;
1256
1257 try_again:
1258 head = NULL;
1259 offset = PAGE_SIZE;
1260 while ((offset -= size) >= 0) {
1261 bh = get_unused_buffer_head(async);
1262 if (!bh)
1263 goto no_grow;
1264
1265 bh->b_dev = NODEV;
1266 bh->b_this_page = head;
1267 head = bh;
1268
1269 bh->b_state = 0;
1270 bh->b_next_free = NULL;
1271 bh->b_pprev = NULL;
1272 atomic_set(&bh->b_count, 0);
1273 bh->b_size = size;
1274
1275 set_bh_page(bh, page, offset);
1276
1277 bh->b_list = BUF_CLEAN;
1278 bh->b_end_io = NULL;
1279 }
1280 return head;
1281 /*
1282 * In case anything failed, we just free everything we got.
1283 */
1284 no_grow:
1285 if (head) {
1286 spin_lock(&unused_list_lock);
1287 do {
1288 bh = head;
1289 head = head->b_this_page;
1290 __put_unused_buffer_head(bh);
1291 } while (head);
1292 spin_unlock(&unused_list_lock);
1293
1294 /* Wake up any waiters ... */
1295 wake_up(&buffer_wait);
1296 }
1297
1298 /*
1299 * Return failure for non-async IO requests. Async IO requests
1300 * are not allowed to fail, so we have to wait until buffer heads
1301 * become available. But we don't want tasks sleeping with
1302 * partially complete buffers, so all were released above.
1303 */
1304 if (!async)
1305 return NULL;
1306
1307 /* We're _really_ low on memory. Now we just
1308 * wait for old buffer heads to become free due to
1309 * finishing IO. Since this is an async request and
1310 * the reserve list is empty, we're sure there are
1311 * async buffer heads in use.
1312 */
1313 run_task_queue(&tq_disk);
1314
1315 free_more_memory();
1316 goto try_again;
1317 }
1318
1319 /*
1320 * Called when truncating a buffer on a page completely.
1321 */
1322 static void discard_buffer(struct buffer_head * bh)
1323 {
1324 if (buffer_mapped(bh)) {
1325 mark_buffer_clean(bh);
1326 lock_buffer(bh);
1327 clear_bit(BH_Uptodate, &bh->b_state);
1328 clear_bit(BH_Mapped, &bh->b_state);
1329 clear_bit(BH_Req, &bh->b_state);
1330 clear_bit(BH_New, &bh->b_state);
1331 remove_from_queues(bh);
1332 unlock_buffer(bh);
1333 }
1334 }
1335
1336 /**
1337 * try_to_release_page - release old fs-specific metadata on a page
1338 *
1339 */
1340
1341 int try_to_release_page(struct page * page, int gfp_mask)
1342 {
1343 if (!PageLocked(page))
1344 BUG();
1345
1346 if (!page->mapping)
1347 goto try_to_free;
1348 if (!page->mapping->a_ops->releasepage)
1349 goto try_to_free;
1350 if (page->mapping->a_ops->releasepage(page, gfp_mask))
1351 goto try_to_free;
1352 /*
1353 * We couldn't release buffer metadata; don't even bother trying
1354 * to release buffers.
1355 */
1356 return 0;
1357 try_to_free:
1358 return try_to_free_buffers(page, gfp_mask);
1359 }
1360
1361 /*
1362 * We don't have to release all buffers here, but
1363 * we have to be sure that no dirty buffer is left
1364 * and no IO is going on (no buffer is locked), because
1365 * we have truncated the file and are going to free the
1366 * blocks on-disk..
1367 */
1368 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1369 {
1370 struct buffer_head *head, *bh, *next;
1371 unsigned int curr_off = 0;
1372
1373 if (!PageLocked(page))
1374 BUG();
1375 if (!page->buffers)
1376 return 1;
1377
1378 head = page->buffers;
1379 bh = head;
1380 do {
1381 unsigned int next_off = curr_off + bh->b_size;
1382 next = bh->b_this_page;
1383
1384 /*
1385 * is this block fully flushed?
1386 */
1387 if (offset <= curr_off)
1388 discard_buffer(bh);
1389 curr_off = next_off;
1390 bh = next;
1391 } while (bh != head);
1392
1393 /*
1394 * subtle. We release buffer-heads only if this is
1395 * the 'final' flushpage. We have invalidated the get_block
1396 * cached value unconditionally, so real IO is not
1397 * possible anymore.
1398 *
1399 * If the free doesn't work out, the buffers can be
1400 * left around - they just turn into anonymous buffers
1401 * instead.
1402 */
1403 if (!offset) {
1404 if (!try_to_release_page(page, 0))
1405 return 0;
1406 }
1407
1408 return 1;
1409 }
1410
1411 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1412 {
1413 struct buffer_head *bh, *head, *tail;
1414
1415 /* FIXME: create_buffers should fail if there's not enough memory */
1416 head = create_buffers(page, blocksize, 1);
1417 if (page->buffers)
1418 BUG();
1419
1420 bh = head;
1421 do {
1422 bh->b_dev = dev;
1423 bh->b_blocknr = 0;
1424 bh->b_end_io = NULL;
1425 tail = bh;
1426 bh = bh->b_this_page;
1427 } while (bh);
1428 tail->b_this_page = head;
1429 page->buffers = head;
1430 page_cache_get(page);
1431 }
1432 EXPORT_SYMBOL(create_empty_buffers);
1433
1434 /*
1435 * We are taking a block for data and we don't want any output from any
1436 * buffer-cache aliases starting from the return from this function and
1437 * until the moment when something explicitly marks the buffer
1438 * dirty (hopefully that will not happen until we free that block ;-)
1439 * We don't even need to mark it not-uptodate - nobody can expect
1440 * anything from a newly allocated buffer anyway. We used to use
1441 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1442 * don't want to mark the alias unmapped, for example - it would confuse
1443 * anyone who might pick it with bread() afterwards...
1444 */
1445
1446 static void unmap_underlying_metadata(struct buffer_head * bh)
1447 {
1448 struct buffer_head *old_bh;
1449
1450 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1451 if (old_bh) {
1452 mark_buffer_clean(old_bh);
1453 wait_on_buffer(old_bh);
1454 clear_bit(BH_Req, &old_bh->b_state);
1455 __brelse(old_bh);
1456 }
1457 }
1458
1459 /*
1460 * NOTE! All mapped/uptodate combinations are valid:
1461 *
1462 * Mapped Uptodate Meaning
1463 *
1464 * No No "unknown" - must do get_block()
1465 * No Yes "hole" - zero-filled
1466 * Yes No "allocated" - allocated on disk, not read in
1467 * Yes Yes "valid" - allocated and up-to-date in memory.
1468 *
1469 * "Dirty" is valid only with the last case (mapped+uptodate).
1470 */
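/*
 * A compact restatement (illustrative) of how a read path acts on the
 * four states in the table above:
 *
 *	if (!buffer_mapped(bh) && !buffer_uptodate(bh))
 *		-> unknown: ask get_block() what this block is;
 *	if (!buffer_mapped(bh) && buffer_uptodate(bh))
 *		-> hole: the data is already zero-filled;
 *	if (buffer_mapped(bh) && !buffer_uptodate(bh))
 *		-> allocated: submit a READ to bring the data in;
 *	if (buffer_mapped(bh) && buffer_uptodate(bh))
 *		-> valid: nothing to do.
 *
 * block_read_full_page() below follows this logic.
 */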
1471
1472 /*
1473 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1474 */
1475 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1476 {
1477 int err, i;
1478 unsigned long block;
1479 struct buffer_head *bh, *head;
1480 int need_unlock;
1481
1482 if (!PageLocked(page))
1483 BUG();
1484
1485 if (!page->buffers)
1486 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1487 head = page->buffers;
1488
1489 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1490
1491 bh = head;
1492 i = 0;
1493
1494 /* Stage 1: make sure we have all the buffers mapped! */
1495 do {
1496 /*
1497 * If the buffer isn't up-to-date, we can't be sure
1498 * that the buffer has been initialized with the proper
1499 * block number information etc..
1500 *
1501 * Leave it to the low-level FS to make all those
1502 * decisions (block #0 may actually be a valid block)
1503 */
1504 if (!buffer_mapped(bh)) {
1505 err = get_block(inode, block, bh, 1);
1506 if (err)
1507 goto out;
1508 if (buffer_new(bh))
1509 unmap_underlying_metadata(bh);
1510 }
1511 bh = bh->b_this_page;
1512 block++;
1513 } while (bh != head);
1514
1515 /* Stage 2: lock the buffers, mark them clean */
1516 do {
1517 lock_buffer(bh);
1518 set_buffer_async_io(bh);
1519 set_bit(BH_Uptodate, &bh->b_state);
1520 clear_bit(BH_Dirty, &bh->b_state);
1521 bh = bh->b_this_page;
1522 } while (bh != head);
1523
1524 /* Stage 3: submit the IO */
1525 do {
1526 struct buffer_head *next = bh->b_this_page;
1527 submit_bh(WRITE, bh);
1528 bh = next;
1529 } while (bh != head);
1530
1531 /* Done - end_buffer_io_async will unlock */
1532 SetPageUptodate(page);
1533
1534 wakeup_page_waiters(page);
1535
1536 return 0;
1537
1538 out:
1539 /*
1540 * ENOSPC, or some other error. We may already have added some
1541 * blocks to the file, so we need to write these out to avoid
1542 * exposing stale data.
1543 */
1544 ClearPageUptodate(page);
1545 bh = head;
1546 need_unlock = 1;
1547 /* Recovery: lock and submit the mapped buffers */
1548 do {
1549 if (buffer_mapped(bh)) {
1550 lock_buffer(bh);
1551 set_buffer_async_io(bh);
1552 need_unlock = 0;
1553 }
1554 bh = bh->b_this_page;
1555 } while (bh != head);
1556 do {
1557 struct buffer_head *next = bh->b_this_page;
1558 if (buffer_mapped(bh)) {
1559 set_bit(BH_Uptodate, &bh->b_state);
1560 clear_bit(BH_Dirty, &bh->b_state);
1561 submit_bh(WRITE, bh);
1562 }
1563 bh = next;
1564 } while (bh != head);
1565 if (need_unlock)
1566 UnlockPage(page);
1567 wakeup_page_waiters(page);
1568 return err;
1569 }
1570
1571 static int __block_prepare_write(struct inode *inode, struct page *page,
1572 unsigned from, unsigned to, get_block_t *get_block)
1573 {
1574 unsigned block_start, block_end;
1575 unsigned long block;
1576 int err = 0;
1577 unsigned blocksize, bbits;
1578 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1579 char *kaddr = kmap(page);
1580
1581 blocksize = 1 << inode->i_blkbits;
1582 if (!page->buffers)
1583 create_empty_buffers(page, inode->i_dev, blocksize);
1584 head = page->buffers;
1585
1586 bbits = inode->i_blkbits;
1587 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1588
1589 for(bh = head, block_start = 0; bh != head || !block_start;
1590 block++, block_start=block_end, bh = bh->b_this_page) {
1591 if (!bh)
1592 BUG();
1593 block_end = block_start+blocksize;
1594 if (block_end <= from)
1595 continue;
1596 if (block_start >= to)
1597 break;
1598 clear_bit(BH_New, &bh->b_state);
1599 if (!buffer_mapped(bh)) {
1600 err = get_block(inode, block, bh, 1);
1601 if (err)
1602 goto out;
1603 if (buffer_new(bh)) {
1604 unmap_underlying_metadata(bh);
1605 if (Page_Uptodate(page)) {
1606 set_bit(BH_Uptodate, &bh->b_state);
1607 continue;
1608 }
1609 if (block_end > to)
1610 memset(kaddr+to, 0, block_end-to);
1611 if (block_start < from)
1612 memset(kaddr+block_start, 0, from-block_start);
1613 if (block_end > to || block_start < from)
1614 flush_dcache_page(page);
1615 continue;
1616 }
1617 }
1618 if (Page_Uptodate(page)) {
1619 set_bit(BH_Uptodate, &bh->b_state);
1620 continue;
1621 }
1622 if (!buffer_uptodate(bh) &&
1623 (block_start < from || block_end > to)) {
1624 ll_rw_block(READ, 1, &bh);
1625 *wait_bh++=bh;
1626 }
1627 }
1628 /*
1629 * If we issued read requests - let them complete.
1630 */
1631 while(wait_bh > wait) {
1632 wait_on_buffer(*--wait_bh);
1633 if (!buffer_uptodate(*wait_bh))
1634 return -EIO;
1635 }
1636 return 0;
1637 out:
1638 /*
1639 * Zero out any newly allocated blocks to avoid exposing stale
1640 * data. If BH_New is set, we know that the block was newly
1641 * allocated in the above loop.
1642 *
1643 * Details: the buffer can be new and uptodate because:
1644 * 1) hole in uptodate page, get_block(create) allocate the block,
1645 * so the buffer is new and additionally we also mark it uptodate
1646 * 2) The buffer is not mapped and uptodate due to a previous partial read.
1647 *
1648 * We can always ignore uptodate buffers here, if you mark a buffer
1649 * uptodate you must make sure it contains the right data first.
1650 *
1651 * We must stop the "undo/clear" fixup pass not at the caller "to"
1652 * but at the last block that we successfully reached in the main loop.
1653 */
1654 bh = head;
1655 to = block_start; /* stop at the last successfully handled block */
1656 block_start = 0;
1657 do {
1658 block_end = block_start+blocksize;
1659 if (block_end <= from)
1660 goto next_bh;
1661 if (block_start >= to)
1662 break;
1663 if (buffer_new(bh) && !buffer_uptodate(bh)) {
1664 memset(kaddr+block_start, 0, bh->b_size);
1665 flush_dcache_page(page);
1666 set_bit(BH_Uptodate, &bh->b_state);
1667 mark_buffer_dirty(bh);
1668 }
1669 next_bh:
1670 block_start = block_end;
1671 bh = bh->b_this_page;
1672 } while (bh != head);
1673 return err;
1674 }
1675
1676 static int __block_commit_write(struct inode *inode, struct page *page,
1677 unsigned from, unsigned to)
1678 {
1679 unsigned block_start, block_end;
1680 int partial = 0, need_balance_dirty = 0;
1681 unsigned blocksize;
1682 struct buffer_head *bh, *head;
1683
1684 blocksize = 1 << inode->i_blkbits;
1685
1686 for(bh = head = page->buffers, block_start = 0;
1687 bh != head || !block_start;
1688 block_start=block_end, bh = bh->b_this_page) {
1689 block_end = block_start + blocksize;
1690 if (block_end <= from || block_start >= to) {
1691 if (!buffer_uptodate(bh))
1692 partial = 1;
1693 } else {
1694 set_bit(BH_Uptodate, &bh->b_state);
1695 if (!atomic_set_buffer_dirty(bh)) {
1696 __mark_dirty(bh);
1697 buffer_insert_inode_data_queue(bh, inode);
1698 need_balance_dirty = 1;
1699 }
1700 }
1701 }
1702
1703 if (need_balance_dirty)
1704 balance_dirty();
1705 /*
1706 * If this is a partial write that happened to make all buffers
1707 * uptodate then we can optimize away a bogus readpage() for
1708 * the next read(). Here we 'discover' whether the page went
1709 * uptodate as a result of this (potentially partial) write.
1710 */
1711 if (!partial)
1712 SetPageUptodate(page);
1713 return 0;
1714 }
1715
1716 /*
1717 * Generic "read page" function for block devices that have the normal
1718 * get_block functionality. This is most of the block device filesystems.
1719 * Reads the page asynchronously --- the unlock_buffer() and
1720 * mark_buffer_uptodate() functions propagate buffer state into the
1721 * page struct once IO has completed.
1722 */
1723 int block_read_full_page(struct page *page, get_block_t *get_block)
1724 {
1725 struct inode *inode = page->mapping->host;
1726 unsigned long iblock, lblock;
1727 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1728 unsigned int blocksize, blocks;
1729 int nr, i;
1730
1731 if (!PageLocked(page))
1732 PAGE_BUG(page);
1733 blocksize = 1 << inode->i_blkbits;
1734 if (!page->buffers)
1735 create_empty_buffers(page, inode->i_dev, blocksize);
1736 head = page->buffers;
1737
1738 blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1739 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1740 lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1741 bh = head;
1742 nr = 0;
1743 i = 0;
1744
1745 do {
1746 if (buffer_uptodate(bh))
1747 continue;
1748
1749 if (!buffer_mapped(bh)) {
1750 if (iblock < lblock) {
1751 if (get_block(inode, iblock, bh, 0))
1752 continue;
1753 }
1754 if (!buffer_mapped(bh)) {
1755 memset(kmap(page) + i*blocksize, 0, blocksize);
1756 flush_dcache_page(page);
1757 kunmap(page);
1758 set_bit(BH_Uptodate, &bh->b_state);
1759 continue;
1760 }
1761 /* get_block() might have updated the buffer synchronously */
1762 if (buffer_uptodate(bh))
1763 continue;
1764 }
1765
1766 arr[nr] = bh;
1767 nr++;
1768 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1769
1770 if (!nr) {
1771 /*
1772 * all buffers are uptodate - we can set the page
1773 * uptodate as well.
1774 */
1775 SetPageUptodate(page);
1776 UnlockPage(page);
1777 return 0;
1778 }
1779
1780 /* Stage two: lock the buffers */
1781 for (i = 0; i < nr; i++) {
1782 struct buffer_head * bh = arr[i];
1783 lock_buffer(bh);
1784 set_buffer_async_io(bh);
1785 }
1786
1787 /* Stage 3: start the IO */
1788 for (i = 0; i < nr; i++) {
1789 struct buffer_head * bh = arr[i];
1790 if (buffer_uptodate(bh))
1791 end_buffer_io_async(bh, 1);
1792 else
1793 submit_bh(READ, bh);
1794 }
1795
1796 wakeup_page_waiters(page);
1797
1798 return 0;
1799 }
1800
1801 /* utility function for filesystems that need to do work on expanding
1802 * truncates. Uses prepare/commit_write to allow the filesystem to
1803 * deal with the hole.
1804 */
1805 int generic_cont_expand(struct inode *inode, loff_t size)
1806 {
1807 struct address_space *mapping = inode->i_mapping;
1808 struct page *page;
1809 unsigned long index, offset, limit;
1810 int err;
1811
1812 err = -EFBIG;
1813 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1814 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1815 send_sig(SIGXFSZ, current, 0);
1816 goto out;
1817 }
1818 if (size > inode->i_sb->s_maxbytes)
1819 goto out;
1820
1821 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1822
1823 /* ugh. in prepare/commit_write, if from==to==start of block, we
1824 ** skip the prepare. make sure we never send an offset for the start
1825 ** of a block
1826 */
1827 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1828 offset++;
1829 }
1830 index = size >> PAGE_CACHE_SHIFT;
1831 err = -ENOMEM;
1832 page = grab_cache_page(mapping, index);
1833 if (!page)
1834 goto out;
1835 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1836 if (!err) {
1837 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1838 }
1839 UnlockPage(page);
1840 page_cache_release(page);
1841 if (err > 0)
1842 err = 0;
1843 out:
1844 return err;
1845 }
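/*
 * Illustrative sketch: how a filesystem's expanding-truncate path
 * might use the helper above (example_expand is a hypothetical name).
 */
static int example_expand(struct inode *inode, loff_t new_size)
{
	if (new_size <= inode->i_size)
		return 0;	/* not an expansion: nothing to fill */
	return generic_cont_expand(inode, new_size);
}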
1846
1847 /*
1848 * For moronic filesystems that do not allow holes in files.
1849 * We may have to extend the file.
1850 */
1851
1852 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1853 {
1854 struct address_space *mapping = page->mapping;
1855 struct inode *inode = mapping->host;
1856 struct page *new_page;
1857 unsigned long pgpos;
1858 long status;
1859 unsigned zerofrom;
1860 unsigned blocksize = 1 << inode->i_blkbits;
1861 char *kaddr;
1862
1863 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1864 status = -ENOMEM;
1865 new_page = grab_cache_page(mapping, pgpos);
1866 if (!new_page)
1867 goto out;
1868 /* we might sleep */
1869 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1870 UnlockPage(new_page);
1871 page_cache_release(new_page);
1872 continue;
1873 }
1874 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1875 if (zerofrom & (blocksize-1)) {
1876 *bytes |= (blocksize-1);
1877 (*bytes)++;
1878 }
1879 status = __block_prepare_write(inode, new_page, zerofrom,
1880 PAGE_CACHE_SIZE, get_block);
1881 if (status)
1882 goto out_unmap;
1883 kaddr = page_address(new_page);
1884 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1885 flush_dcache_page(new_page);
1886 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1887 kunmap(new_page);
1888 UnlockPage(new_page);
1889 page_cache_release(new_page);
1890 }
1891
1892 if (page->index < pgpos) {
1893 /* completely inside the area */
1894 zerofrom = offset;
1895 } else {
1896 /* page covers the boundary, find the boundary offset */
1897 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1898
1899 /* if we will expand the file, the last block will be filled */
1900 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1901 *bytes |= (blocksize-1);
1902 (*bytes)++;
1903 }
1904
1905 /* starting below the boundary? Nothing to zero out */
1906 if (offset <= zerofrom)
1907 zerofrom = offset;
1908 }
1909 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1910 if (status)
1911 goto out1;
1912 kaddr = page_address(page);
1913 if (zerofrom < offset) {
1914 memset(kaddr+zerofrom, 0, offset-zerofrom);
1915 flush_dcache_page(page);
1916 __block_commit_write(inode, page, zerofrom, offset);
1917 }
1918 return 0;
1919 out1:
1920 ClearPageUptodate(page);
1921 kunmap(page);
1922 return status;
1923
1924 out_unmap:
1925 ClearPageUptodate(new_page);
1926 kunmap(new_page);
1927 UnlockPage(new_page);
1928 page_cache_release(new_page);
1929 out:
1930 return status;
1931 }
1932
1933 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1934 get_block_t *get_block)
1935 {
1936 struct inode *inode = page->mapping->host;
1937 int err = __block_prepare_write(inode, page, from, to, get_block);
1938 if (err) {
1939 ClearPageUptodate(page);
1940 kunmap(page);
1941 }
1942 return err;
1943 }
1944
1945 int block_commit_write(struct page *page, unsigned from, unsigned to)
1946 {
1947 struct inode *inode = page->mapping->host;
1948 __block_commit_write(inode,page,from,to);
1949 kunmap(page);
1950 return 0;
1951 }
1952
1953 int generic_commit_write(struct file *file, struct page *page,
1954 unsigned from, unsigned to)
1955 {
1956 struct inode *inode = page->mapping->host;
1957 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1958 __block_commit_write(inode,page,from,to);
1959 kunmap(page);
1960 if (pos > inode->i_size) {
1961 inode->i_size = pos;
1962 mark_inode_dirty(inode);
1963 }
1964 return 0;
1965 }
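/*
 * Sketch of the prepare/commit pairing (illustrative): this is the
 * shape of the generic write path. The page is kmapped by
 * __block_prepare_write() and kunmapped by generic_commit_write();
 * the real write path copies with copy_from_user() rather than
 * memcpy(). example_write_range() is a hypothetical name.
 */
static int example_write_range(struct file *file, struct page *page,
			       unsigned from, unsigned to, const char *data)
{
	struct address_space *mapping = page->mapping;
	int err;

	err = mapping->a_ops->prepare_write(file, page, from, to);
	if (err)
		return err;
	memcpy(page_address(page) + from, data, to - from);
	return mapping->a_ops->commit_write(file, page, from, to);
}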
1966
1967 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1968 {
1969 unsigned long index = from >> PAGE_CACHE_SHIFT;
1970 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1971 unsigned blocksize, iblock, length, pos;
1972 struct inode *inode = mapping->host;
1973 struct page *page;
1974 struct buffer_head *bh;
1975 int err;
1976
1977 blocksize = 1 << inode->i_blkbits;
1978 length = offset & (blocksize - 1);
1979
1980 /* Block boundary? Nothing to do */
1981 if (!length)
1982 return 0;
1983
1984 length = blocksize - length;
1985 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1986
1987 page = grab_cache_page(mapping, index);
1988 err = -ENOMEM;
1989 if (!page)
1990 goto out;
1991
1992 if (!page->buffers)
1993 create_empty_buffers(page, inode->i_dev, blocksize);
1994
1995 /* Find the buffer that contains "offset" */
1996 bh = page->buffers;
1997 pos = blocksize;
1998 while (offset >= pos) {
1999 bh = bh->b_this_page;
2000 iblock++;
2001 pos += blocksize;
2002 }
2003
2004 err = 0;
2005 if (!buffer_mapped(bh)) {
2006 /* Hole? Nothing to do */
2007 if (buffer_uptodate(bh))
2008 goto unlock;
2009 get_block(inode, iblock, bh, 0);
2010 /* Still unmapped? Nothing to do */
2011 if (!buffer_mapped(bh))
2012 goto unlock;
2013 }
2014
2015 /* Ok, it's mapped. Make sure it's up-to-date */
2016 if (Page_Uptodate(page))
2017 set_bit(BH_Uptodate, &bh->b_state);
2018
2019 if (!buffer_uptodate(bh)) {
2020 err = -EIO;
2021 ll_rw_block(READ, 1, &bh);
2022 wait_on_buffer(bh);
2023 /* Uhhuh. Read error. Complain and punt. */
2024 if (!buffer_uptodate(bh))
2025 goto unlock;
2026 }
2027
2028 memset(kmap(page) + offset, 0, length);
2029 flush_dcache_page(page);
2030 kunmap(page);
2031
2032 if (!atomic_set_buffer_dirty(bh)) {
2033 __mark_dirty(bh);
2034 buffer_insert_inode_data_queue(bh, inode);
2035 balance_dirty();
2036 }
2037
2038 err = 0;
2039
2040 unlock:
2041 UnlockPage(page);
2042 page_cache_release(page);
2043 out:
2044 return err;
2045 }
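
/*
 * Illustrative sketch (hypothetical myfs_* names): block_truncate_page()
 * is normally called from the filesystem's truncate path to zero the
 * tail of what is now the last block, before the on-disk blocks past
 * the new i_size are freed.
 */
static void myfs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
	/* ...then release the filesystem's blocks beyond the new i_size... */
}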
2046
2047 int block_write_full_page(struct page *page, get_block_t *get_block)
2048 {
2049 struct inode *inode = page->mapping->host;
2050 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2051 unsigned offset;
2052 int err;
2053
2054 /* easy case */
2055 if (page->index < end_index)
2056 return __block_write_full_page(inode, page, get_block);
2057
2058 /* things got complicated... */
2059 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2060 /* OK, are we completely out? */
2061 if (page->index >= end_index+1 || !offset) {
2062 UnlockPage(page);
2063 return -EIO;
2064 }
2065
2066 /* Sigh... will have to work, then... */
2067 err = __block_prepare_write(inode, page, 0, offset, get_block);
2068 if (!err) {
2069 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2070 flush_dcache_page(page);
2071 __block_commit_write(inode,page,0,offset);
2072 done:
2073 kunmap(page);
2074 UnlockPage(page);
2075 return err;
2076 }
2077 ClearPageUptodate(page);
2078 goto done;
2079 }
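
/*
 * Illustrative sketch (hypothetical names as above):
 * block_write_full_page() is the usual writepage() implementation for
 * a get_block-based filesystem.
 */
static int myfs_writepage(struct page *page)
{
	return block_write_full_page(page, myfs_get_block);
}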
2080
2081 /*
2082 * Commence writeout of all the buffers against a page. The
2083 * page must be locked. Returns zero on success or a negative
2084 * errno.
2085 */
2086 int writeout_one_page(struct page *page)
2087 {
2088 struct buffer_head *bh, *head = page->buffers;
2089
2090 if (!PageLocked(page))
2091 BUG();
2092 bh = head;
2093 do {
2094 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2095 continue;
2096
2097 bh->b_flushtime = jiffies;
2098 ll_rw_block(WRITE, 1, &bh);
2099 } while ((bh = bh->b_this_page) != head);
2100 return 0;
2101 }
2102 EXPORT_SYMBOL(writeout_one_page);
2103
2104 /*
2105 * Wait for completion of I/O of all buffers against a page. The page
2106 * must be locked. Returns zero on success or a negative errno.
2107 */
2108 int waitfor_one_page(struct page *page)
2109 {
2110 int error = 0;
2111 struct buffer_head *bh, *head = page->buffers;
2112
2113 bh = head;
2114 do {
2115 wait_on_buffer(bh);
2116 if (buffer_req(bh) && !buffer_uptodate(bh))
2117 error = -EIO;
2118 } while ((bh = bh->b_this_page) != head);
2119 return error;
2120 }
2121 EXPORT_SYMBOL(waitfor_one_page);
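
/*
 * Illustrative sketch (editor's annotation): the usual pairing of the
 * two helpers above for synchronously flushing one page.  Assumes the
 * caller may take the page lock; error handling is reduced to the
 * essentials.
 */
static int ex_sync_one_page(struct page *page)
{
	int err;

	lock_page(page);			/* both helpers require a locked page */
	err = writeout_one_page(page);		/* start WRITE on dirty buffers */
	if (!err)
		err = waitfor_one_page(page);	/* wait; collects -EIO if any */
	UnlockPage(page);
	return err;
}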
2122
2123 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2124 {
2125 struct buffer_head tmp;
2126 struct inode *inode = mapping->host;
2127 tmp.b_state = 0;
2128 tmp.b_blocknr = 0;
2129 get_block(inode, block, &tmp, 0);
2130 return tmp.b_blocknr;
2131 }
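
/*
 * Illustrative sketch (hypothetical names): filesystems expose
 * generic_block_bmap() through the bmap() address_space operation.
 */
static int myfs_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}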
2132
2133 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2134 {
2135 int i, nr_blocks, retval;
2136 unsigned long * blocks = iobuf->blocks;
2137 int length;
2138 int beyond_eof = 0;
2139
2140 length = iobuf->length;
2141 nr_blocks = length / blocksize;
2142 /* build the blocklist */
2143 for (i = 0; i < nr_blocks; i++, blocknr++) {
2144 struct buffer_head bh;
2145
2146 bh.b_state = 0;
2147 bh.b_dev = inode->i_dev;
2148 bh.b_size = blocksize;
2149 bh.b_page = NULL;
2150
2151 if (((loff_t) blocknr) * blocksize >= inode->i_size)
2152 beyond_eof = 1;
2153
2154 /* Only allow get_block to create new blocks if we are safely
2155 beyond EOF. O_DIRECT is unsafe inside sparse files. */
2156 retval = get_block(inode, blocknr, &bh,
2157 ((rw != READ) && beyond_eof));
2158
2159 if (retval) {
2160 if (!i)
2161 /* report error to userspace */
2162 goto out;
2163 else
2164 /* do short I/O until 'i' */
2165 break;
2166 }
2167
2168 if (rw == READ) {
2169 if (buffer_new(&bh))
2170 BUG();
2171 if (!buffer_mapped(&bh)) {
2172 				/* there was a hole in the filesystem */
2173 blocks[i] = -1UL;
2174 continue;
2175 }
2176 } else {
2177 if (buffer_new(&bh))
2178 unmap_underlying_metadata(&bh);
2179 if (!buffer_mapped(&bh))
2180 /* upper layers need to pass the error on or
2181 * fall back to buffered IO. */
2182 return -ENOTBLK;
2183 }
2184 blocks[i] = bh.b_blocknr;
2185 }
2186
2187 /* patch length to handle short I/O */
2188 iobuf->length = i * blocksize;
2189 if (!beyond_eof)
2190 up(&inode->i_sem);
2191 retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2192 if (!beyond_eof)
2193 down(&inode->i_sem);
2194 /* restore orig length */
2195 iobuf->length = length;
2196 out:
2197
2198 return retval;
2199 }
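
/*
 * Illustrative sketch (hypothetical names): a filesystem's direct_IO
 * operation is normally a thin wrapper around generic_direct_IO(),
 * supplying its own get_block and block size.
 */
static int myfs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
			  unsigned long blocknr, int blocksize)
{
	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
				 myfs_get_block);
}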
2200
2201 /*
2202 * IO completion routine for a buffer_head being used for kiobuf IO: we
2203 * can't dispatch the kiobuf callback until io_count reaches 0.
2204 */
2205
2206 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2207 {
2208 struct kiobuf *kiobuf;
2209
2210 mark_buffer_uptodate(bh, uptodate);
2211
2212 kiobuf = bh->b_private;
2213 unlock_buffer(bh);
2214 end_kio_request(kiobuf, uptodate);
2215 }
2216
2217 /*
2218 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2219 * for them to complete. Clean up the buffer_heads afterwards.
2220 */
2221
2222 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2223 {
2224 int iosize, err;
2225 int i;
2226 struct buffer_head *tmp;
2227
2228 iosize = 0;
2229 err = 0;
2230
2231 for (i = nr; --i >= 0; ) {
2232 iosize += size;
2233 tmp = bh[i];
2234 wait_on_buffer(tmp);
2235
2236 if (!buffer_uptodate(tmp)) {
2237 /* We are traversing bh'es in reverse order so
2238 clearing iosize on error calculates the
2239 amount of IO before the first error. */
2240 iosize = 0;
2241 err = -EIO;
2242 }
2243 }
2244
2245 if (iosize)
2246 return iosize;
2247 return err;
2248 }
2249
2250 /*
2251 * Start I/O on a physical range of kernel memory, defined by a vector
2252 * of kiobuf structs (much like a user-space iovec list).
2253 *
2254 * The kiobuf must already be locked for IO. IO is submitted
2255 * asynchronously: you need to check page->locked and page->uptodate.
2256 *
2257 * It is up to the caller to make sure that there are enough blocks
2258 * passed in to completely map the iobufs to disk.
2259 */
2260
2261 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2262 kdev_t dev, unsigned long b[], int size)
2263 {
2264 int err;
2265 int length;
2266 int transferred;
2267 int i;
2268 int bufind;
2269 int pageind;
2270 int bhind;
2271 int offset;
2272 unsigned long blocknr;
2273 struct kiobuf * iobuf = NULL;
2274 struct page * map;
2275 struct buffer_head *tmp, **bhs = NULL;
2276
2277 if (!nr)
2278 return 0;
2279
2280 /*
2281 * First, do some alignment and validity checks
2282 */
2283 for (i = 0; i < nr; i++) {
2284 iobuf = iovec[i];
2285 if ((iobuf->offset & (size-1)) ||
2286 (iobuf->length & (size-1)))
2287 return -EINVAL;
2288 if (!iobuf->nr_pages)
2289 panic("brw_kiovec: iobuf not initialised");
2290 }
2291
2292 /*
2293 * OK to walk down the iovec doing page IO on each page we find.
2294 */
2295 bufind = bhind = transferred = err = 0;
2296 for (i = 0; i < nr; i++) {
2297 iobuf = iovec[i];
2298 offset = iobuf->offset;
2299 length = iobuf->length;
2300 iobuf->errno = 0;
2301 if (!bhs)
2302 bhs = iobuf->bh;
2303
2304 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2305 map = iobuf->maplist[pageind];
2306 if (!map) {
2307 err = -EFAULT;
2308 goto finished;
2309 }
2310
2311 while (length > 0) {
2312 blocknr = b[bufind++];
2313 if (blocknr == -1UL) {
2314 if (rw == READ) {
2315 					/* there was a hole in the filesystem */
2316 memset(kmap(map) + offset, 0, size);
2317 flush_dcache_page(map);
2318 kunmap(map);
2319
2320 transferred += size;
2321 goto skip_block;
2322 } else
2323 BUG();
2324 }
2325 tmp = bhs[bhind++];
2326
2327 tmp->b_size = size;
2328 set_bh_page(tmp, map, offset);
2329 tmp->b_this_page = tmp;
2330
2331 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2332 tmp->b_dev = dev;
2333 tmp->b_blocknr = blocknr;
2334 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2335
2336 if (rw == WRITE) {
2337 set_bit(BH_Uptodate, &tmp->b_state);
2338 clear_bit(BH_Dirty, &tmp->b_state);
2339 } else
2340 set_bit(BH_Uptodate, &tmp->b_state);
2341
2342 atomic_inc(&iobuf->io_count);
2343 submit_bh(rw, tmp);
2344 /*
2345 * Wait for IO if we have got too much
2346 */
2347 if (bhind >= KIO_MAX_SECTORS) {
2348 kiobuf_wait_for_io(iobuf); /* wake-one */
2349 err = wait_kio(rw, bhind, bhs, size);
2350 if (err >= 0)
2351 transferred += err;
2352 else
2353 goto finished;
2354 bhind = 0;
2355 }
2356
2357 skip_block:
2358 length -= size;
2359 offset += size;
2360
2361 if (offset >= PAGE_SIZE) {
2362 offset = 0;
2363 break;
2364 }
2365 } /* End of block loop */
2366 } /* End of page loop */
2367 } /* End of iovec loop */
2368
2369 /* Is there any IO still left to submit? */
2370 if (bhind) {
2371 kiobuf_wait_for_io(iobuf); /* wake-one */
2372 err = wait_kio(rw, bhind, bhs, size);
2373 if (err >= 0)
2374 transferred += err;
2375 else
2376 goto finished;
2377 }
2378
2379 finished:
2380 if (transferred)
2381 return transferred;
2382 return err;
2383 }
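
/*
 * Illustrative sketch (editor's annotation) of a brw_kiovec() caller,
 * loosely modelled on raw-device style I/O: map a user buffer into a
 * kiobuf and read consecutive blocks from "dev".  Assumes len is a
 * multiple of blocksize and small enough that the block list fits in
 * KIO_MAX_SECTORS entries; the ex_* name is hypothetical.
 */
static int ex_raw_read(kdev_t dev, unsigned long va, size_t len,
		       unsigned long first_block, int blocksize)
{
	struct kiobuf *iobuf;
	unsigned long blocks[KIO_MAX_SECTORS];
	int i, err;

	err = alloc_kiovec(1, &iobuf);
	if (err)
		return err;
	err = map_user_kiobuf(READ, iobuf, va, len);
	if (err)
		goto out_free;

	for (i = 0; i < len / blocksize; i++)
		blocks[i] = first_block + i;

	/* waits internally; returns bytes transferred or -errno */
	err = brw_kiovec(READ, 1, &iobuf, dev, blocks, blocksize);

	unmap_kiobuf(iobuf);
out_free:
	free_kiovec(1, &iobuf);
	return err;
}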
2384
2385 /*
2386 * Start I/O on a page.
2387 * This function expects the page to be locked and may return
2388 * before I/O is complete. You then have to check page->locked
2389 * and page->uptodate.
2390 *
2391 * brw_page() is SMP-safe, although it is currently called with the
2392 * kernel lock held - the code itself is ready to run without it.
2393 *
2394 * FIXME: we need a swapper_inode->get_block function to remove
2395 * some of the bmap kludges and interface ugliness here.
2396 */
2397 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2398 {
2399 struct buffer_head *head, *bh;
2400
2401 if (!PageLocked(page))
2402 panic("brw_page: page not locked for I/O");
2403
2404 if (!page->buffers)
2405 create_empty_buffers(page, dev, size);
2406 head = bh = page->buffers;
2407
2408 /* Stage 1: lock all the buffers */
2409 do {
2410 lock_buffer(bh);
2411 bh->b_blocknr = *(b++);
2412 set_bit(BH_Mapped, &bh->b_state);
2413 set_buffer_async_io(bh);
2414 bh = bh->b_this_page;
2415 } while (bh != head);
2416
2417 /* Stage 2: start the IO */
2418 do {
2419 struct buffer_head *next = bh->b_this_page;
2420 submit_bh(rw, bh);
2421 bh = next;
2422 } while (bh != head);
2423 wakeup_page_waiters(page);
2424 return 0;
2425 }
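
/*
 * Illustrative sketch (editor's annotation): a minimal brw_page()
 * caller, simplified from the way the swap code submits page-sized
 * I/O.  The page must already be locked; brw_page() returns before
 * the I/O completes, so the caller still has to wait on the page.
 */
static int ex_rw_page(int rw, struct page *page, kdev_t dev,
		      int first_block, int blocksize)
{
	int i, nr = PAGE_SIZE / blocksize;
	int blocks[PAGE_SIZE / 512];		/* worst case: 512-byte blocks */

	for (i = 0; i < nr; i++)
		blocks[i] = first_block + i;	/* one block number per buffer */
	return brw_page(rw, page, dev, blocks, blocksize);
}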
2426
2427 int block_symlink(struct inode *inode, const char *symname, int len)
2428 {
2429 struct address_space *mapping = inode->i_mapping;
2430 struct page *page = grab_cache_page(mapping, 0);
2431 int err = -ENOMEM;
2432 char *kaddr;
2433
2434 if (!page)
2435 goto fail;
2436 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2437 if (err)
2438 goto fail_map;
2439 kaddr = page_address(page);
2440 memcpy(kaddr, symname, len-1);
2441 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2442 /*
2443 	 * Notice that we are _not_ going to block here - the end of the page
2444 	 * is unmapped, so this will only try to map the rest of the page, see
2445 	 * that it is unmapped (typically it will not even look into the inode -
2446 * ->i_size will be enough for everything) and zero it out.
2447 * OTOH it's obviously correct and should make the page up-to-date.
2448 */
2449 err = mapping->a_ops->readpage(NULL, page);
2450 wait_on_page(page);
2451 page_cache_release(page);
2452 if (err < 0)
2453 goto fail;
2454 mark_inode_dirty(inode);
2455 return 0;
2456 fail_map:
2457 UnlockPage(page);
2458 page_cache_release(page);
2459 fail:
2460 return err;
2461 }
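
/*
 * Illustrative sketch (hypothetical context): a filesystem's symlink()
 * method hands the target string to block_symlink() once the inode has
 * been created.  Note that len includes the trailing NUL; the helper
 * stores len-1 bytes.
 */
static int ex_store_symlink(struct inode *inode, const char *symname)
{
	return block_symlink(inode, symname, strlen(symname) + 1);
}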
2462
2463 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2464 {
2465 struct buffer_head *bh, *tail;
2466
2467 bh = head;
2468 do {
2469 tail = bh;
2470 bh = bh->b_this_page;
2471 } while (bh);
2472 tail->b_this_page = head;
2473 page->buffers = head;
2474 page_cache_get(page);
2475 }
2476
2477 /*
2478 * Create the page-cache page that contains the requested block
2479 */
2480 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2481 {
2482 struct page * page;
2483 struct buffer_head *bh;
2484
2485 page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2486 if (!page)
2487 return NULL;
2488
2489 if (!PageLocked(page))
2490 BUG();
2491
2492 bh = page->buffers;
2493 if (bh) {
2494 if (bh->b_size == size)
2495 return page;
2496 if (!try_to_free_buffers(page, GFP_NOFS))
2497 goto failed;
2498 }
2499
2500 bh = create_buffers(page, size, 0);
2501 if (!bh)
2502 goto failed;
2503 link_dev_buffers(page, bh);
2504 return page;
2505
2506 failed:
2507 UnlockPage(page);
2508 page_cache_release(page);
2509 return NULL;
2510 }
2511
2512 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2513 {
2514 struct buffer_head *head = page->buffers;
2515 struct buffer_head *bh = head;
2516 unsigned int uptodate;
2517
2518 uptodate = 1 << BH_Mapped;
2519 if (Page_Uptodate(page))
2520 uptodate |= 1 << BH_Uptodate;
2521
2522 write_lock(&hash_table_lock);
2523 do {
2524 if (!(bh->b_state & (1 << BH_Mapped))) {
2525 init_buffer(bh, NULL, NULL);
2526 bh->b_dev = dev;
2527 bh->b_blocknr = block;
2528 bh->b_state = uptodate;
2529 }
2530
2531 /* Insert the buffer into the hash lists if necessary */
2532 if (!bh->b_pprev)
2533 __insert_into_hash_list(bh);
2534
2535 block++;
2536 bh = bh->b_this_page;
2537 } while (bh != head);
2538 write_unlock(&hash_table_lock);
2539 }
2540
2541 /*
2542 * Try to increase the number of buffers available: the size argument
2543 * is used to determine what kind of buffers we want.
2544 */
2545 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2546 {
2547 struct page * page;
2548 struct block_device *bdev;
2549 unsigned long index;
2550 int sizebits;
2551
2552 	/* Size must be a multiple of the hard sector size */
2553 if (size & (get_hardsect_size(dev)-1))
2554 BUG();
2555 	/* Size must be between 512 bytes and PAGE_SIZE */
2556 if (size < 512 || size > PAGE_SIZE)
2557 BUG();
2558
2559 sizebits = -1;
2560 do {
2561 sizebits++;
2562 } while ((size << sizebits) < PAGE_SIZE);
2563
2564 index = block >> sizebits;
2565 block = index << sizebits;
2566
2567 bdev = bdget(kdev_t_to_nr(dev));
2568 if (!bdev) {
2569 printk("No block device for %s\n", kdevname(dev));
2570 BUG();
2571 }
2572
2573 /* Create a page with the proper size buffers.. */
2574 page = grow_dev_page(bdev, index, size);
2575
2576 /* This is "wrong" - talk to Al Viro */
2577 atomic_dec(&bdev->bd_count);
2578 if (!page)
2579 return 0;
2580
2581 /* Hash in the buffers on the hash list */
2582 hash_page_buffers(page, dev, block, size);
2583 UnlockPage(page);
2584 page_cache_release(page);
2585
2586 /* We hashed up this page, so increment buffermem */
2587 atomic_inc(&buffermem_pages);
2588 return 1;
2589 }
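
/*
 * Worked example (editor's annotation) of the sizebits/index arithmetic
 * above, assuming PAGE_SIZE = 4096 and size = 1024: sizebits ends up as
 * 2, so each page covers 4 blocks.  A request for block 4099 yields
 * index = 4099 >> 2 = 1024, and block is realigned to 1024 << 2 = 4096,
 * the first block backed by that page.
 */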
2590
2591 /*
2592 * The first time the VM inspects a page which has locked buffers, it
2593 * will just mark it as needing to be waited upon during the scan of
2594 * the page LRU.  BH_Wait_IO is used for this.
2595 *
2596 * The second time the VM visits the page, if it still has locked
2597 * buffers, it is time to start writing them out. (BH_Wait_IO was set).
2598 *
2599 * The third time the VM visits the page, if the I/O hasn't completed
2600 * then it's time to wait upon writeout. BH_Lock and BH_Launder are
2601 * used for this.
2602 *
2603 * There is also the case of buffers which were locked by someone else
2604 * - write(2) callers, bdflush, etc. There can be a huge number of these
2605 * and we don't want to just skip them all and fail the page allocation.
2606 * We want to be able to wait on these buffers as well.
2607 *
2608 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2609 * underway against the buffer, no matter who started it - we know
2610 * that the buffer will eventually come unlocked, and so it's safe to
2611 * wait on it.
2612 *
2613 * The caller holds the page lock and the caller will free this page
2614 * into current->local_page, so by waiting on the page's buffers the
2615 * caller is guaranteed to obtain this page.
2616 *
2617 * sync_page_buffers() will sort-of return true if all the buffers
2618 * against this page are freeable, so try_to_free_buffers() should
2619 * try to free the page's buffers a second time. This is a bit
2620 * broken for blocksize < PAGE_CACHE_SIZE, but that does not matter much.
2621 */
2622 static int sync_page_buffers(struct buffer_head *head)
2623 {
2624 struct buffer_head * bh = head;
2625 int tryagain = 1;
2626
2627 do {
2628 if (!buffer_dirty(bh) && !buffer_locked(bh))
2629 continue;
2630
2631 /* Don't start IO first time around.. */
2632 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2633 tryagain = 0;
2634 continue;
2635 }
2636
2637 /* Second time through we start actively writing out.. */
2638 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2639 if (unlikely(!buffer_launder(bh))) {
2640 tryagain = 0;
2641 continue;
2642 }
2643 wait_on_buffer(bh);
2644 tryagain = 1;
2645 continue;
2646 }
2647
2648 if (!atomic_set_buffer_clean(bh)) {
2649 unlock_buffer(bh);
2650 continue;
2651 }
2652
2653 __mark_buffer_clean(bh);
2654 get_bh(bh);
2655 bh->b_end_io = end_buffer_io_sync;
2656 submit_bh(WRITE, bh);
2657 tryagain = 0;
2658 } while ((bh = bh->b_this_page) != head);
2659
2660 return tryagain;
2661 }
2662
2663 /*
2664 * Can the buffer be thrown out?
2665 */
2666 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
2667 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2668
2669 /*
2670 * try_to_free_buffers() checks if all the buffers on this particular page
2671 * are unused, and free's the page if so.
2672 *
2673 * Wake up bdflush() if this fails - if we're running low on memory due
2674 * to dirty buffers, we need to flush them out as quickly as possible.
2675 *
2676 * NOTE: There are quite a number of ways that threads of control can
2677 * obtain a reference to a buffer head within a page. So we must
2678 * lock out all of these paths to cleanly toss the page.
2679 */
2680 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2681 {
2682 struct buffer_head * tmp, * bh = page->buffers;
2683
2684 cleaned_buffers_try_again:
2685 spin_lock(&lru_list_lock);
2686 write_lock(&hash_table_lock);
2687 tmp = bh;
2688 do {
2689 if (buffer_busy(tmp))
2690 goto busy_buffer_page;
2691 tmp = tmp->b_this_page;
2692 } while (tmp != bh);
2693
2694 spin_lock(&unused_list_lock);
2695 tmp = bh;
2696
2697 /* if this buffer was hashed, this page counts as buffermem */
2698 if (bh->b_pprev)
2699 atomic_dec(&buffermem_pages);
2700 do {
2701 struct buffer_head * p = tmp;
2702 tmp = tmp->b_this_page;
2703
2704 if (p->b_dev == B_FREE) BUG();
2705
2706 remove_inode_queue(p);
2707 __remove_from_queues(p);
2708 __put_unused_buffer_head(p);
2709 } while (tmp != bh);
2710 spin_unlock(&unused_list_lock);
2711
2712 /* Wake up anyone waiting for buffer heads */
2713 wake_up(&buffer_wait);
2714
2715 /* And free the page */
2716 page->buffers = NULL;
2717 page_cache_release(page);
2718 write_unlock(&hash_table_lock);
2719 spin_unlock(&lru_list_lock);
2720 return 1;
2721
2722 busy_buffer_page:
2723 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2724 write_unlock(&hash_table_lock);
2725 spin_unlock(&lru_list_lock);
2726 gfp_mask = pf_gfp_mask(gfp_mask);
2727 if (gfp_mask & __GFP_IO) {
2728 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2729 if (sync_page_buffers(bh)) {
2730 /* no IO or waiting next time */
2731 gfp_mask = 0;
2732 goto cleaned_buffers_try_again;
2733 }
2734 }
2735 }
2736 if (balance_dirty_state() >= 0)
2737 wakeup_bdflush();
2738 return 0;
2739 }
2740 EXPORT_SYMBOL(try_to_free_buffers);
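
/*
 * Illustrative sketch (editor's annotation): the reclaim-side calling
 * convention for try_to_free_buffers().  A return of 1 means the
 * buffers were detached and the page can be freed; 0 means some buffer
 * was busy and writeback may have been started instead.
 */
static int ex_release_page_buffers(struct page *page, unsigned int gfp_mask)
{
	if (!page->buffers)
		return 1;
	return try_to_free_buffers(page, gfp_mask);
}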
2741
2742 /* ================== Debugging =================== */
2743
2744 void show_buffers(void)
2745 {
2746 #ifdef CONFIG_SMP
2747 struct buffer_head * bh;
2748 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2749 int nlist;
2750 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2751 #endif
2752
2753 printk("Buffer memory: %6dkB\n",
2754 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2755
2756 printk("Cache memory: %6dkB\n",
2757 (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2758
2759 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2760 if (!spin_trylock(&lru_list_lock))
2761 return;
2762 for(nlist = 0; nlist < NR_LIST; nlist++) {
2763 found = locked = dirty = used = lastused = 0;
2764 bh = lru_list[nlist];
2765 if(!bh) continue;
2766
2767 do {
2768 found++;
2769 if (buffer_locked(bh))
2770 locked++;
2771 if (buffer_dirty(bh))
2772 dirty++;
2773 if (atomic_read(&bh->b_count))
2774 used++, lastused = found;
2775 bh = bh->b_next_free;
2776 } while (bh != lru_list[nlist]);
2777 {
2778 int tmp = nr_buffers_type[nlist];
2779 if (found != tmp)
2780 printk("%9s: BUG -> found %d, reported %d\n",
2781 buf_types[nlist], found, tmp);
2782 }
2783 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2784 "%d locked, %d dirty\n",
2785 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2786 used, lastused, locked, dirty);
2787 }
2788 spin_unlock(&lru_list_lock);
2789 #endif
2790 }
2791
2792 /* ===================== Init ======================= */
2793
2794 /*
2795 * allocate the hash table and init the free list
2796 * Use gfp() for the hash table to decrease TLB misses, use
2797 * SLAB cache for buffer heads.
2798 */
2799 void __init buffer_init(unsigned long mempages)
2800 {
2801 int order, i;
2802 unsigned int nr_hash;
2803
2804 /* The buffer cache hash table is less important these days,
2805 * trim it a bit.
2806 */
2807 mempages >>= 14;
2808
2809 mempages *= sizeof(struct buffer_head *);
2810
2811 for (order = 0; (1 << order) < mempages; order++)
2812 ;
2813
2814 /* try to allocate something until we get it or we're asking
2815 for something that is really too small */
2816
2817 do {
2818 unsigned long tmp;
2819
2820 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2821 bh_hash_mask = (nr_hash - 1);
2822
2823 tmp = nr_hash;
2824 bh_hash_shift = 0;
2825 while((tmp >>= 1UL) != 0UL)
2826 bh_hash_shift++;
2827
2828 hash_table = (struct buffer_head **)
2829 __get_free_pages(GFP_ATOMIC, order);
2830 } while (hash_table == NULL && --order > 0);
2831 printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2832 nr_hash, order, (PAGE_SIZE << order));
2833
2834 if (!hash_table)
2835 panic("Failed to allocate buffer hash table\n");
2836
2837 /* Setup hash chains. */
2838 for(i = 0; i < nr_hash; i++)
2839 hash_table[i] = NULL;
2840
2841 /* Setup lru lists. */
2842 for(i = 0; i < NR_LIST; i++)
2843 lru_list[i] = NULL;
2844
2845 }
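
/*
 * Worked example (editor's annotation) of the sizing above, assuming a
 * 128MB i386 box: mempages = 32768 4KB pages, so 32768 >> 14 = 2 and
 * 2 * sizeof(void *) = 8 bytes, giving order = 3 (the first order with
 * 1 << order >= 8).  The table is then (4096 << 3) / 4 = 8192 buckets,
 * so bh_hash_mask = 8191 and bh_hash_shift = 13.
 */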
2846
2847
2848 /* ====================== bdflush support =================== */
2849
2850 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2851 * response to dirty buffers. Once this process is activated, we write back
2852 * a limited number of buffers to the disks and then go back to sleep again.
2853 */
2854
2855 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2856
2857 void wakeup_bdflush(void)
2858 {
2859 wake_up_interruptible(&bdflush_wait);
2860 }
2861
2862 /*
2863 * Here we attempt to write back old buffers. We also try to flush inodes
2864  * and supers, since this function is essentially "update", and
2865  * otherwise there would be no way of ensuring that these quantities ever
2866  * get written back.  Ideally, we would have a timestamp on the inodes
2867  * and superblocks so that we could write back only the old ones.
2868 */
2869
2870 static int sync_old_buffers(void)
2871 {
2872 lock_kernel();
2873 sync_unlocked_inodes();
2874 sync_supers(0, 0);
2875 unlock_kernel();
2876
2877 for (;;) {
2878 struct buffer_head *bh;
2879
2880 spin_lock(&lru_list_lock);
2881 bh = lru_list[BUF_DIRTY];
2882 if (!bh || time_before(jiffies, bh->b_flushtime))
2883 break;
2884 if (write_some_buffers(NODEV))
2885 continue;
2886 return 0;
2887 }
2888 spin_unlock(&lru_list_lock);
2889 return 0;
2890 }
2891
2892 int block_sync_page(struct page *page)
2893 {
2894 run_task_queue(&tq_disk);
2895 return 0;
2896 }
2897
2898 /* This is the interface to bdflush. As we get more sophisticated, we can
2899 * pass tuning parameters to this "process", to adjust how it behaves.
2900 * We would want to verify each parameter, however, to make sure that it
2901 * is reasonable. */
2902
2903 asmlinkage long sys_bdflush(int func, long data)
2904 {
2905 if (!capable(CAP_SYS_ADMIN))
2906 return -EPERM;
2907
2908 if (func == 1) {
2909 		/* do_exit directly and let kupdate do its work alone. */
2910 do_exit(0);
2911 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2912 a syscall that doesn't care about the current mm context. */
2913 int error;
2914 struct mm_struct *user_mm;
2915
2916 /*
2917 		 * bdflush will spend all of its time in kernel-space,
2918 * without touching user-space, so we can switch it into
2919 * 'lazy TLB mode' to reduce the cost of context-switches
2920 * to and from bdflush.
2921 */
2922 user_mm = start_lazy_tlb();
2923 error = sync_old_buffers();
2924 end_lazy_tlb(user_mm);
2925 return error;
2926 #endif
2927 }
2928
2929 	/* Basically func 2 means read param 1, func 3 means write param 1, etc */
2930 if (func >= 2) {
2931 int i = (func-2) >> 1;
2932 if (i >= 0 && i < N_PARAM) {
2933 if ((func & 1) == 0)
2934 return put_user(bdf_prm.data[i], (int*)data);
2935
2936 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2937 bdf_prm.data[i] = data;
2938 return 0;
2939 }
2940 }
2941 return -EINVAL;
2942 }
2943
2944 	/* Func 0 used to launch the actual bdflush and then never
2945 * return (unless explicitly killed). We return zero here to
2946 * remain semi-compatible with present update(8) programs.
2947 */
2948 return 0;
2949 }
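
/*
 * Worked example (editor's annotation) of the func encoding above,
 * seen from userspace: for parameter N (counted from 1), func = 2*N
 * reads it and func = 2*N + 1 sets it.  E.g. a hypothetical caller:
 *
 *	long ndirty;
 *	bdflush(4, (long)&ndirty);	// read parameter 2, "ndirty"
 *	bdflush(5, 500);		// set parameter 2 to 500
 */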
2950
2951 /*
2952 * This is the actual bdflush daemon itself. It used to be started from
2953 * the syscall above, but now we launch it ourselves internally with
2954 * kernel_thread(...) directly after the first thread in init/main.c
2955 */
2956 int bdflush(void *startup)
2957 {
2958 struct task_struct *tsk = current;
2959
2960 /*
2961 * We have a bare-bones task_struct, and really should fill
2962 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2963 * display semi-sane things. Not real crucial though...
2964 */
2965
2966 tsk->session = 1;
2967 tsk->pgrp = 1;
2968 strcpy(tsk->comm, "bdflush");
2969
2970 /* avoid getting signals */
2971 spin_lock_irq(&tsk->sigmask_lock);
2972 flush_signals(tsk);
2973 sigfillset(&tsk->blocked);
2974 recalc_sigpending(tsk);
2975 spin_unlock_irq(&tsk->sigmask_lock);
2976
2977 complete((struct completion *)startup);
2978
2979 /*
2980 * FIXME: The ndirty logic here is wrong. It's supposed to
2981 * send bdflush back to sleep after writing ndirty buffers.
2982 	 * The test is wrong, however, so bdflush will in fact
2983 * sleep when bdflush_stop() returns true.
2984 *
2985 * FIXME: If it proves useful to implement ndirty properly,
2986 * then perhaps the value of ndirty should be scaled by the
2987 * amount of memory in the machine.
2988 */
2989 for (;;) {
2990 int ndirty = bdf_prm.b_un.ndirty;
2991
2992 CHECK_EMERGENCY_SYNC
2993
2994 while (ndirty > 0) {
2995 spin_lock(&lru_list_lock);
2996 if (!write_some_buffers(NODEV))
2997 break;
2998 ndirty -= NRSYNC;
2999 }
3000 if (ndirty > 0 || bdflush_stop())
3001 interruptible_sleep_on(&bdflush_wait);
3002 }
3003 }
3004
3005 /*
3006  * This is the kernel update daemon.  It used to live in userspace,
3007  * but since it needs to run reliably we want it to be unkillable by mistake.
3008 * You don't need to change your userspace configuration since
3009 * the userspace `update` will do_exit(0) at the first sys_bdflush().
3010 */
3011 int kupdate(void *startup)
3012 {
3013 struct task_struct * tsk = current;
3014 int interval;
3015
3016 tsk->session = 1;
3017 tsk->pgrp = 1;
3018 strcpy(tsk->comm, "kupdated");
3019
3020 /* sigstop and sigcont will stop and wakeup kupdate */
3021 spin_lock_irq(&tsk->sigmask_lock);
3022 sigfillset(&tsk->blocked);
3023 	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3024 recalc_sigpending(tsk);
3025 spin_unlock_irq(&tsk->sigmask_lock);
3026
3027 complete((struct completion *)startup);
3028
3029 for (;;) {
3030 /* update interval */
3031 interval = bdf_prm.b_un.interval;
3032 if (interval) {
3033 tsk->state = TASK_INTERRUPTIBLE;
3034 schedule_timeout(interval);
3035 } else {
3036 stop_kupdate:
3037 tsk->state = TASK_STOPPED;
3038 schedule(); /* wait for SIGCONT */
3039 }
3040 /* check for sigstop */
3041 if (signal_pending(tsk)) {
3042 int stopped = 0;
3043 spin_lock_irq(&tsk->sigmask_lock);
3044 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
3045 sigdelset(&tsk->pending.signal, SIGSTOP);
3046 stopped = 1;
3047 }
3048 recalc_sigpending(tsk);
3049 spin_unlock_irq(&tsk->sigmask_lock);
3050 if (stopped)
3051 goto stop_kupdate;
3052 }
3053 #ifdef DEBUG
3054 printk(KERN_DEBUG "kupdate() activated...\n");
3055 #endif
3056 sync_old_buffers();
3057 run_task_queue(&tq_disk);
3058 }
3059 }
3060
3061 static int __init bdflush_init(void)
3062 {
3063 static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3064
3065 kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3066 wait_for_completion(&startup);
3067 kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3068 wait_for_completion(&startup);
3069 return 0;
3070 }
3071
3072 module_init(bdflush_init)
3073