FreeBSD/Linux Kernel Cross Reference
linux/fs/buffer.c


    1 /*
    2  *  linux/fs/buffer.c
    3  *
    4  *  Copyright (C) 1991, 1992  Linus Torvalds
    5  */
    6 
    7 /*
    8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
    9  * been avoided by NEVER letting an interrupt change a buffer (except for the
   10  * data, of course), but instead letting the caller do it.
   11  */
   12 
   13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
   14 
   15 /* Removed a lot of unnecessary code and simplified things now that
   16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
   17  */
   18 
   19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
   20  * hash table, use SLAB cache for buffer heads. -DaveM
   21  */
   22 
    23 /* Added 32k buffer block sizes - these are required for older ARM systems.
   24  * - RMK
   25  */
   26 
   27 /* Thread it... -DaveM */
   28 
   29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
   30 
   31 #include <linux/config.h>
   32 #include <linux/sched.h>
   33 #include <linux/fs.h>
   34 #include <linux/slab.h>
   35 #include <linux/locks.h>
   36 #include <linux/errno.h>
   37 #include <linux/swap.h>
   38 #include <linux/swapctl.h>
   39 #include <linux/smp_lock.h>
   40 #include <linux/vmalloc.h>
   41 #include <linux/blkdev.h>
   42 #include <linux/sysrq.h>
   43 #include <linux/file.h>
   44 #include <linux/init.h>
   45 #include <linux/quotaops.h>
   46 #include <linux/iobuf.h>
   47 #include <linux/highmem.h>
   48 #include <linux/module.h>
   49 #include <linux/completion.h>
   50 
   51 #include <asm/uaccess.h>
   52 #include <asm/io.h>
   53 #include <asm/bitops.h>
   54 #include <asm/mmu_context.h>
   55 
   56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
    57 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20) /* don't ever have more than this 
   58                                              number of unused buffer heads */
   59 
   60 /* Anti-deadlock ordering:
   61  *      lru_list_lock > hash_table_lock > unused_list_lock
   62  */
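
/*
 * Illustration only (not part of the original file, and compiled out with
 * #if 0 because the locks are declared a few lines further down): a sketch
 * of a hypothetical helper that nests all three locks in the documented
 * order and releases them in reverse.
 */
#if 0
static void example_lock_all_lists(void)
{
	spin_lock(&lru_list_lock);              /* outermost */
	write_lock(&hash_table_lock);           /* nests inside lru_list_lock */
	spin_lock(&unused_list_lock);           /* innermost */

	/* ... touch the lru, hash and unused lists here ... */

	spin_unlock(&unused_list_lock);         /* release in reverse order */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
}
#endif
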
   63 
   64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
   65 
   66 /*
   67  * Hash table gook..
   68  */
   69 static unsigned int bh_hash_mask;
   70 static unsigned int bh_hash_shift;
   71 static struct buffer_head **hash_table;
   72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
   73 
   74 static struct buffer_head *lru_list[NR_LIST];
   75 
   76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
   77 #define lru_list_lock  lru_list_lock_cacheline.lock
   78 
   79 static int nr_buffers_type[NR_LIST];
   80 static unsigned long size_buffers_type[NR_LIST];
   81 
   82 static struct buffer_head * unused_list;
   83 static int nr_unused_buffer_heads;
   84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
   85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
   86 
   87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
   88 static int osync_buffers_list(struct list_head *);
   89 static void __refile_buffer(struct buffer_head *);
   90 
   91 /* This is used by some architectures to estimate available memory. */
   92 atomic_t buffermem_pages = ATOMIC_INIT(0);
   93 
   94 /* Here is the parameter block for the bdflush process. If you add or
   95  * remove any of the parameters, make sure to update kernel/sysctl.c
   96  * and the documentation at linux/Documentation/sysctl/vm.txt.
   97  */
   98 
   99 #define N_PARAM 9
  100 
  101 /* The dummy values in this structure are left in there for compatibility
  102  * with old programs that play with the /proc entries.
  103  */
  104 union bdflush_param {
  105         struct {
  106                 int nfract;     /* Percentage of buffer cache dirty to 
  107                                    activate bdflush */
  108                 int ndirty;     /* Maximum number of dirty blocks to write out per
  109                                    wake-cycle */
  110                 int dummy2;     /* old "nrefill" */
  111                 int dummy3;     /* unused */
  112                 int interval;   /* jiffies delay between kupdate flushes */
  113                 int age_buffer; /* Time for normal buffer to age before we flush it */
  114                 int nfract_sync;/* Percentage of buffer cache dirty to 
  115                                    activate bdflush synchronously */
   116                 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
  117                 int dummy5;     /* unused */
  118         } b_un;
  119         unsigned int data[N_PARAM];
  120 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
  121 
  122 /* These are the min and max parameter values that we will allow to be assigned */
  123 int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
  124 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
  125 
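
/*
 * A small sketch (not part of the original file; the helper name is made
 * up) of the union trick above: the named b_un fields and the flat data[]
 * array alias the same nine words, which is what lets the /proc interface
 * and the bdflush syscall treat the parameter block as plain integers.
 */
static int example_bdflush_param_alias(void)
{
	/* bdf_prm.data[4] and bdf_prm.b_un.interval name the same word */
	return bdf_prm.data[4] == (unsigned int) bdf_prm.b_un.interval;
}
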
  126 void unlock_buffer(struct buffer_head *bh)
  127 {
  128         clear_bit(BH_Wait_IO, &bh->b_state);
  129         clear_bit(BH_Launder, &bh->b_state);
  130         /*
  131          * When a locked buffer is visible to the I/O layer BH_Launder
  132          * is set. This means before unlocking we must clear BH_Launder,
  133          * mb() on alpha and then clear BH_Lock, so no reader can see
   134          * BH_Launder set on an unlocked buffer and then risk a deadlock.
  135          */
  136         smp_mb__after_clear_bit();
  137         clear_bit(BH_Lock, &bh->b_state);
  138         smp_mb__after_clear_bit();
  139         if (waitqueue_active(&bh->b_wait))
  140                 wake_up(&bh->b_wait);
  141 }
  142 
  143 /*
  144  * Note that the real wait_on_buffer() is an inline function that checks
  145  * that the buffer is locked before calling this, so that unnecessary disk
  146  * unplugging does not occur.
  147  */
  148 void __wait_on_buffer(struct buffer_head * bh)
  149 {
  150         struct task_struct *tsk = current;
  151         DECLARE_WAITQUEUE(wait, tsk);
  152 
  153         get_bh(bh);
  154         add_wait_queue(&bh->b_wait, &wait);
  155         do {
  156                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  157                 if (!buffer_locked(bh))
  158                         break;
   159                 /*
   160                  * We must read tq_disk in TQ_ACTIVE only after the
   161                  * add_wait_queue effect is visible to other cpus.
   162                  * We could unplug some lines above and it wouldn't
   163                  * matter, but we can't do that right after
   164                  * add_wait_queue without an smp_mb() in between,
   165                  * because spin_unlock has inclusive semantics.
   166                  * Doing it here is the most efficient place: we
   167                  * avoid a spurious unplug if we get a racy wakeup
   168                  * that makes buffer_locked() return 0, and we avoid
   169                  * an explicit smp_mb() because we rely on the
   170                  * implicit one in set_task_state.
   171                  */
  172                 run_task_queue(&tq_disk);
  173                 schedule();
  174         } while (buffer_locked(bh));
  175         tsk->state = TASK_RUNNING;
  176         remove_wait_queue(&bh->b_wait, &wait);
  177         put_bh(bh);
  178 }
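
/*
 * A minimal sketch (not part of the original file; the helper name is made
 * up) of the caller pattern described above: queue the I/O with
 * ll_rw_block() and then block in wait_on_buffer(), which only falls
 * through to __wait_on_buffer() if the buffer is really still locked.
 */
static int example_read_block_sync(struct buffer_head *bh)
{
	ll_rw_block(READ, 1, &bh);      /* queue the read */
	wait_on_buffer(bh);             /* sleep until the I/O completes */
	if (!buffer_uptodate(bh))
		return -EIO;            /* I/O finished but failed */
	return 0;
}
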
  179 
  180 /*
  181  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
  182  * unlock the buffer. This is what ll_rw_block uses too.
  183  */
  184 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  185 {
  186         mark_buffer_uptodate(bh, uptodate);
  187         unlock_buffer(bh);
  188         put_bh(bh);
  189 }
  190 
  191 /*
  192  * The buffers have been marked clean and locked.  Just submit the dang
  193  * things.. 
  194  */
  195 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
  196 {
  197         do {
  198                 struct buffer_head * bh = *array++;
  199                 bh->b_end_io = end_buffer_io_sync;
  200                 submit_bh(WRITE, bh);
  201         } while (--count);
  202 }
  203 
  204 /*
  205  * Write some buffers from the head of the dirty queue.
  206  *
  207  * This must be called with the LRU lock held, and will
  208  * return without it!
  209  */
  210 #define NRSYNC (32)
  211 static int write_some_buffers(kdev_t dev)
  212 {
  213         struct buffer_head *next;
  214         struct buffer_head *array[NRSYNC];
  215         unsigned int count;
  216         int nr;
  217 
  218         next = lru_list[BUF_DIRTY];
  219         nr = nr_buffers_type[BUF_DIRTY];
  220         count = 0;
  221         while (next && --nr >= 0) {
  222                 struct buffer_head * bh = next;
  223                 next = bh->b_next_free;
  224 
  225                 if (dev != NODEV && bh->b_dev != dev)
  226                         continue;
  227                 if (test_and_set_bit(BH_Lock, &bh->b_state))
  228                         continue;
  229                 if (atomic_set_buffer_clean(bh)) {
  230                         __refile_buffer(bh);
  231                         get_bh(bh);
  232                         array[count++] = bh;
  233                         if (count < NRSYNC)
  234                                 continue;
  235 
  236                         spin_unlock(&lru_list_lock);
  237                         write_locked_buffers(array, count);
  238                         return -EAGAIN;
  239                 }
  240                 unlock_buffer(bh);
  241                 __refile_buffer(bh);
  242         }
  243         spin_unlock(&lru_list_lock);
  244 
  245         if (count)
  246                 write_locked_buffers(array, count);
  247         return 0;
  248 }
  249 
  250 /*
  251  * Write out all buffers on the dirty list.
  252  */
  253 static void write_unlocked_buffers(kdev_t dev)
  254 {
  255         do
  256                 spin_lock(&lru_list_lock);
  257         while (write_some_buffers(dev));
  258 }
  259 
  260 /*
  261  * Wait for a buffer on the proper list.
  262  *
  263  * This must be called with the LRU lock held, and
  264  * will return with it released.
  265  */
  266 static int wait_for_buffers(kdev_t dev, int index, int refile)
  267 {
  268         struct buffer_head * next;
  269         int nr;
  270 
  271         next = lru_list[index];
  272         nr = nr_buffers_type[index];
  273         while (next && --nr >= 0) {
  274                 struct buffer_head *bh = next;
  275                 next = bh->b_next_free;
  276 
  277                 if (!buffer_locked(bh)) {
  278                         if (refile)
  279                                 __refile_buffer(bh);
  280                         continue;
  281                 }
  282                 if (dev != NODEV && bh->b_dev != dev)
  283                         continue;
  284 
  285                 get_bh(bh);
  286                 spin_unlock(&lru_list_lock);
  287                 wait_on_buffer (bh);
  288                 put_bh(bh);
  289                 return -EAGAIN;
  290         }
  291         spin_unlock(&lru_list_lock);
  292         return 0;
  293 }
  294 
  295 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
  296 {
  297         do {
  298                 spin_lock(&lru_list_lock);
  299         } while (wait_for_buffers(dev, index, refile));
  300         return 0;
  301 }
  302 
  303 /* Call sync_buffers with wait!=0 to ensure that the call does not
  304  * return until all buffer writes have completed.  Sync() may return
  305  * before the writes have finished; fsync() may not.
  306  */
  307 
  308 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
  309  * spontaneously dirty themselves without ever brelse being called.
  310  * We will ultimately want to put these in a separate list, but for
  311  * now we search all of the lists for dirty buffers.
  312  */
  313 int sync_buffers(kdev_t dev, int wait)
  314 {
  315         int err = 0;
  316 
  317         /* One pass for no-wait, three for wait:
  318          * 0) write out all dirty, unlocked buffers;
  319          * 1) wait for all dirty locked buffers;
  320          * 2) write out all dirty, unlocked buffers;
   321          * 3) wait for completion by waiting for all buffers to unlock.
  322          */
  323         write_unlocked_buffers(dev);
  324         if (wait) {
  325                 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
  326                 write_unlocked_buffers(dev);
  327                 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
  328         }
  329         return err;
  330 }
  331 
  332 int fsync_super(struct super_block *sb)
  333 {
  334         kdev_t dev = sb->s_dev;
  335         sync_buffers(dev, 0);
  336 
  337         lock_kernel();
  338         sync_inodes_sb(sb);
  339         DQUOT_SYNC_SB(sb);
  340         lock_super(sb);
  341         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
  342                 sb->s_op->write_super(sb);
  343         unlock_super(sb);
  344         if (sb->s_op && sb->s_op->sync_fs)
  345                 sb->s_op->sync_fs(sb);
  346         unlock_kernel();
  347 
  348         return sync_buffers(dev, 1);
  349 }
  350 
  351 int fsync_no_super(kdev_t dev)
  352 {
  353         sync_buffers(dev, 0);
  354         return sync_buffers(dev, 1);
  355 }
  356 
  357 int fsync_dev(kdev_t dev)
  358 {
  359         sync_buffers(dev, 0);
  360 
  361         lock_kernel();
  362         sync_inodes(dev);
  363         DQUOT_SYNC_DEV(dev);
  364         sync_supers(dev, 1);
  365         unlock_kernel();
  366 
  367         return sync_buffers(dev, 1);
  368 }
  369 
  370 /*
  371  * There's no real reason to pretend we should
  372  * ever do anything differently
  373  */
  374 void sync_dev(kdev_t dev)
  375 {
  376         fsync_dev(dev);
  377 }
  378 
  379 asmlinkage long sys_sync(void)
  380 {
  381         fsync_dev(0);
  382         return 0;
  383 }
  384 
  385 /*
  386  *      filp may be NULL if called via the msync of a vma.
  387  */
  388  
  389 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
  390 {
  391         struct inode * inode = dentry->d_inode;
  392         struct super_block * sb;
  393         kdev_t dev;
  394         int ret;
  395 
  396         lock_kernel();
  397         /* sync the inode to buffers */
  398         write_inode_now(inode, 0);
  399 
  400         /* sync the superblock to buffers */
  401         sb = inode->i_sb;
  402         lock_super(sb);
  403         if (sb->s_op && sb->s_op->write_super)
  404                 sb->s_op->write_super(sb);
  405         unlock_super(sb);
  406 
  407         /* .. finally sync the buffers to disk */
  408         dev = inode->i_dev;
  409         ret = sync_buffers(dev, 1);
  410         unlock_kernel();
  411         return ret;
  412 }
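
/*
 * A sketch (not part of the original file; the structure name is made up)
 * of how a simple block-based filesystem typically plugs the generic
 * file_fsync() above into its file_operations, so that fsync(2) ends up
 * writing the inode, the superblock and then the device's buffers.
 */
static struct file_operations examplefs_file_operations = {
	read:           generic_file_read,
	write:          generic_file_write,
	mmap:           generic_file_mmap,
	fsync:          file_fsync,     /* inode + super + sync_buffers() */
};
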
  413 
  414 asmlinkage long sys_fsync(unsigned int fd)
  415 {
  416         struct file * file;
  417         struct dentry * dentry;
  418         struct inode * inode;
  419         int ret, err;
  420 
  421         ret = -EBADF;
  422         file = fget(fd);
  423         if (!file)
  424                 goto out;
  425 
  426         dentry = file->f_dentry;
  427         inode = dentry->d_inode;
  428 
  429         ret = -EINVAL;
  430         if (!file->f_op || !file->f_op->fsync) {
  431                 /* Why?  We can still call filemap_fdatasync */
  432                 goto out_putf;
  433         }
  434 
  435         /* We need to protect against concurrent writers.. */
  436         down(&inode->i_sem);
  437         ret = filemap_fdatasync(inode->i_mapping);
  438         err = file->f_op->fsync(file, dentry, 0);
  439         if (err && !ret)
  440                 ret = err;
  441         err = filemap_fdatawait(inode->i_mapping);
  442         if (err && !ret)
  443                 ret = err;
  444         up(&inode->i_sem);
  445 
  446 out_putf:
  447         fput(file);
  448 out:
  449         return ret;
  450 }
  451 
  452 int do_fdatasync(struct file *file)
  453 {
  454         int ret, err;
  455         struct dentry *dentry;
  456         struct inode *inode;
  457 
  458         if (unlikely(!file->f_op || !file->f_op->fsync))
  459                 return -EINVAL;
  460         
  461         dentry = file->f_dentry;
  462         inode = dentry->d_inode;
  463 
  464         ret = filemap_fdatasync(inode->i_mapping);
  465         err = file->f_op->fsync(file, dentry, 1);
  466         if (err && !ret)
  467                 ret = err;
  468         err = filemap_fdatawait(inode->i_mapping);
  469         if (err && !ret)
  470                 ret = err;
  471         return ret;
  472 }
  473 
  474 asmlinkage long sys_fdatasync(unsigned int fd)
  475 {
  476         struct file * file;
  477         struct inode *inode;
  478         int ret;
  479 
  480         ret = -EBADF;
  481         file = fget(fd);
  482         if (!file)
  483                 goto out;
  484 
  485         inode = file->f_dentry->d_inode;
  486         down(&inode->i_sem);
  487         ret = do_fdatasync(file);
  488         up(&inode->i_sem);
  489 
  490         fput(file);
  491 out:
  492         return ret;
  493 }
  494 
  495 /* After several hours of tedious analysis, the following hash
  496  * function won.  Do not mess with it... -DaveM
  497  */
  498 #define _hashfn(dev,block)      \
  499         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
  500          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
  501           ((block) << (bh_hash_shift - 12))))
  502 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
  503 
  504 static inline void __insert_into_hash_list(struct buffer_head *bh)
  505 {
  506         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
  507         struct buffer_head *next = *head;
  508 
  509         *head = bh;
  510         bh->b_pprev = head;
  511         bh->b_next = next;
  512         if (next != NULL)
  513                 next->b_pprev = &bh->b_next;
  514 }
  515 
  516 static __inline__ void __hash_unlink(struct buffer_head *bh)
  517 {
  518         struct buffer_head **pprev = bh->b_pprev;
  519         if (pprev) {
  520                 struct buffer_head *next = bh->b_next;
  521                 if (next)
  522                         next->b_pprev = pprev;
  523                 *pprev = next;
  524                 bh->b_pprev = NULL;
  525         }
  526 }
  527 
  528 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
  529 {
  530         struct buffer_head **bhp = &lru_list[blist];
  531 
  532         if (bh->b_prev_free || bh->b_next_free) BUG();
  533 
  534         if(!*bhp) {
  535                 *bhp = bh;
  536                 bh->b_prev_free = bh;
  537         }
  538         bh->b_next_free = *bhp;
  539         bh->b_prev_free = (*bhp)->b_prev_free;
  540         (*bhp)->b_prev_free->b_next_free = bh;
  541         (*bhp)->b_prev_free = bh;
  542         nr_buffers_type[blist]++;
  543         size_buffers_type[blist] += bh->b_size;
  544 }
  545 
  546 static void __remove_from_lru_list(struct buffer_head * bh)
  547 {
  548         struct buffer_head *next = bh->b_next_free;
  549         if (next) {
  550                 struct buffer_head *prev = bh->b_prev_free;
  551                 int blist = bh->b_list;
  552 
  553                 prev->b_next_free = next;
  554                 next->b_prev_free = prev;
  555                 if (lru_list[blist] == bh) {
  556                         if (next == bh)
  557                                 next = NULL;
  558                         lru_list[blist] = next;
  559                 }
  560                 bh->b_next_free = NULL;
  561                 bh->b_prev_free = NULL;
  562                 nr_buffers_type[blist]--;
  563                 size_buffers_type[blist] -= bh->b_size;
  564         }
  565 }
  566 
  567 /* must be called with both the hash_table_lock and the lru_list_lock
  568    held */
  569 static void __remove_from_queues(struct buffer_head *bh)
  570 {
  571         __hash_unlink(bh);
  572         __remove_from_lru_list(bh);
  573 }
  574 
  575 static void remove_from_queues(struct buffer_head *bh)
  576 {
  577         spin_lock(&lru_list_lock);
  578         write_lock(&hash_table_lock);
  579         __remove_from_queues(bh);
  580         write_unlock(&hash_table_lock); 
  581         spin_unlock(&lru_list_lock);
  582 }
  583 
  584 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
  585 {
  586         struct buffer_head *bh, **p = &hash(dev, block);
  587 
  588         read_lock(&hash_table_lock);
  589 
  590         for (;;) {
  591                 bh = *p;
  592                 if (!bh)
  593                         break;
  594                 p = &bh->b_next;
  595                 if (bh->b_blocknr != block)
  596                         continue;
  597                 if (bh->b_size != size)
  598                         continue;
  599                 if (bh->b_dev != dev)
  600                         continue;
  601                 get_bh(bh);
  602                 break;
  603         }
  604 
  605         read_unlock(&hash_table_lock);
  606         return bh;
  607 }
  608 
  609 void buffer_insert_list(struct buffer_head *bh, struct list_head *list)
  610 {
  611         spin_lock(&lru_list_lock);
  612         if (buffer_attached(bh))
  613                 list_del(&bh->b_inode_buffers);
  614         set_buffer_attached(bh);
  615         list_add(&bh->b_inode_buffers, list);
  616         spin_unlock(&lru_list_lock);
  617 }
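
/*
 * A sketch (not part of the original file; the helper name is made up) of
 * how a filesystem attaches a dirty metadata buffer to its inode with the
 * helper above, so that a later fsync of that inode can find and flush it.
 */
static void example_dirty_inode_metadata(struct inode *inode,
					 struct buffer_head *bh)
{
	mark_buffer_dirty(bh);                           /* queue for writeback */
	buffer_insert_list(bh, &inode->i_dirty_buffers); /* track per inode */
}
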
  618 
  619 /*
  620  * The caller must have the lru_list lock before calling the 
  621  * remove_inode_queue functions.
  622  */
  623 static void __remove_inode_queue(struct buffer_head *bh)
  624 {
  625         list_del(&bh->b_inode_buffers);
  626         clear_buffer_attached(bh);
  627 }
  628 
  629 static inline void remove_inode_queue(struct buffer_head *bh)
  630 {
  631         if (buffer_attached(bh))
  632                 __remove_inode_queue(bh);
  633 }
  634 
  635 int inode_has_buffers(struct inode *inode)
  636 {
  637         int ret;
  638         
  639         spin_lock(&lru_list_lock);
  640         ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
  641         spin_unlock(&lru_list_lock);
  642         
  643         return ret;
  644 }
  645 
   646 /* If invalidate_buffers() trashes dirty buffers, it means some kind
   647    of fs corruption is going on. Trashing dirty data always implies losing
   648    information that was supposed to be stored on the physical layer
   649    by the user.
   650 
   651    Thus invalidate_buffers in general usage is not allowed to trash
   652    dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
   653    be preserved.  These buffers are simply skipped.
   654 
   655    We also skip buffers which are still in use.  For example this can
   656    happen if a userspace program is reading the block device.
   657 
   658    NOTE: in the case where the user removed a removable-media disk while
   659    there was still dirty data not synced to disk (due to a bug in the device
   660    driver or to a user error), then by not destroying the dirty buffers we
   661    could also corrupt the next media inserted; thus a parameter is
   662    necessary to handle this case as safely as possible (trying
   663    not to corrupt the newly inserted disk with data belonging to
   664    the old, now-corrupted disk). Also, for a ramdisk, the natural thing
   665    to do in order to release the ramdisk memory is to destroy its dirty buffers.
   666 
   667    These are two special cases. Normal usage implies that the device driver
   668    issues a sync on the device (without waiting for I/O completion) and
   669    then an invalidate_buffers call that doesn't trash dirty buffers.
   670 
   671    For handling cache coherency with the blkdev pagecache, the 'update' case
   672    has been introduced. It is needed to re-read from disk any pinned
   673    buffer. NOTE: re-reading from disk is destructive, so we can do it only
   674    when we assume nobody is changing the buffercache under our I/O and when
   675    we think the disk contains more recent information than the buffercache.
   676    The update == 1 pass marks the buffers we need to update; the update == 2
   677    pass does the actual I/O. */
  678 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
  679 {
  680         int i, nlist, slept;
  681         struct buffer_head * bh, * bh_next;
  682         kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
  683 
  684  retry:
  685         slept = 0;
  686         spin_lock(&lru_list_lock);
  687         for(nlist = 0; nlist < NR_LIST; nlist++) {
  688                 bh = lru_list[nlist];
  689                 if (!bh)
  690                         continue;
  691                 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
  692                         bh_next = bh->b_next_free;
  693 
  694                         /* Another device? */
  695                         if (bh->b_dev != dev)
  696                                 continue;
  697                         /* Not hashed? */
  698                         if (!bh->b_pprev)
  699                                 continue;
  700                         if (buffer_locked(bh)) {
  701                                 get_bh(bh);
  702                                 spin_unlock(&lru_list_lock);
  703                                 wait_on_buffer(bh);
  704                                 slept = 1;
  705                                 spin_lock(&lru_list_lock);
  706                                 put_bh(bh);
  707                         }
  708 
  709                         write_lock(&hash_table_lock);
  710                         /* All buffers in the lru lists are mapped */
  711                         if (!buffer_mapped(bh))
  712                                 BUG();
  713                         if (buffer_dirty(bh) && destroy_dirty_buffers)
  714                                 printk("invalidate: dirty buffer\n");
  715                         if (!atomic_read(&bh->b_count)) {
  716                                 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
  717                                         remove_inode_queue(bh);
  718                                 }
  719                         } else if (!bdev->bd_openers)
  720                                 printk("invalidate: busy buffer\n");
  721 
  722                         write_unlock(&hash_table_lock);
  723                         if (slept)
  724                                 goto out;
  725                 }
  726         }
  727 out:
  728         spin_unlock(&lru_list_lock);
  729         if (slept)
  730                 goto retry;
  731 
  732         /* Get rid of the page cache */
  733         invalidate_inode_pages(bdev->bd_inode);
  734 }
  735 
  736 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
  737 {
  738         struct block_device *bdev = bdget(dev);
  739         if (bdev) {
  740                 invalidate_bdev(bdev, destroy_dirty_buffers);
  741                 bdput(bdev);
  742         }
  743 }
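
/*
 * A sketch (not part of the original file; the function name is made up)
 * of the "normal usage" described in the comment above, as a removable
 * media driver would do it on a media change: push the dirty buffers out
 * first, then drop the clean cached copies without trashing anything that
 * is still dirty.
 */
static void example_media_change(kdev_t dev)
{
	fsync_dev(dev);                 /* write out whatever we can */
	__invalidate_buffers(dev, 0);   /* drop clean buffers, keep dirty ones */
}
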
  744 
  745 static void free_more_memory(void)
  746 {
  747         balance_dirty();
  748         wakeup_bdflush();
  749         try_to_free_pages(GFP_NOIO);
  750         run_task_queue(&tq_disk);
  751         yield();
  752 }
  753 
  754 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  755 {
  756         bh->b_list = BUF_CLEAN;
  757         bh->b_end_io = handler;
  758         bh->b_private = private;
  759 }
  760 
  761 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
  762 {
  763         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
  764         unsigned long flags;
  765         struct buffer_head *tmp;
  766         struct page *page;
  767         int fullup = 1;
  768 
  769         mark_buffer_uptodate(bh, uptodate);
  770 
  771         /* This is a temporary buffer used for page I/O. */
  772         page = bh->b_page;
  773 
  774         if (!uptodate)
  775                 SetPageError(page);
  776 
  777         /*
  778          * Be _very_ careful from here on. Bad things can happen if
  779          * two buffer heads end IO at almost the same time and both
  780          * decide that the page is now completely done.
  781          *
  782          * Async buffer_heads are here only as labels for IO, and get
  783          * thrown away once the IO for this page is complete.  IO is
  784          * deemed complete once all buffers have been visited
  785          * (b_count==0) and are now unlocked. We must make sure that
  786          * only the _last_ buffer that decrements its count is the one
   787          * that unlocks the page.
  788          */
  789         spin_lock_irqsave(&page_uptodate_lock, flags);
  790         mark_buffer_async(bh, 0);
  791         unlock_buffer(bh);
  792         tmp = bh->b_this_page;
  793         while (tmp != bh) {
  794                 if (buffer_locked(tmp)) {
  795                         if (buffer_async(tmp))
  796                                 goto still_busy;
  797                 } else if (!buffer_uptodate(tmp))
  798                         fullup = 0;
  799                 tmp = tmp->b_this_page;
  800         }
  801 
  802         /* OK, the async IO on this page is complete. */
  803         spin_unlock_irqrestore(&page_uptodate_lock, flags);
  804 
  805         /*
  806          * If none of the buffers had errors and all were uptodate
  807          * then we can set the page uptodate:
  808          */
  809         if (fullup && !PageError(page))
  810                 SetPageUptodate(page);
  811 
  812         UnlockPage(page);
  813 
  814         return;
  815 
  816 still_busy:
  817         spin_unlock_irqrestore(&page_uptodate_lock, flags);
  818         return;
  819 }
  820 
  821 inline void set_buffer_async_io(struct buffer_head *bh)
  822 {
  823         bh->b_end_io = end_buffer_io_async;
  824         mark_buffer_async(bh, 1);
  825 }
  826 
  827 /*
  828  * Synchronise all the inode's dirty buffers to the disk.
  829  *
  830  * We have conflicting pressures: we want to make sure that all
  831  * initially dirty buffers get waited on, but that any subsequently
  832  * dirtied buffers don't.  After all, we don't want fsync to last
  833  * forever if somebody is actively writing to the file.
  834  *
  835  * Do this in two main stages: first we copy dirty buffers to a
  836  * temporary inode list, queueing the writes as we go.  Then we clean
  837  * up, waiting for those writes to complete.
  838  * 
  839  * During this second stage, any subsequent updates to the file may end
  840  * up refiling the buffer on the original inode's dirty list again, so
  841  * there is a chance we will end up with a buffer queued for write but
  842  * not yet completed on that list.  So, as a final cleanup we go through
  843  * the osync code to catch these locked, dirty buffers without requeuing
  844  * any newly dirty buffers for write.
  845  */
  846 int fsync_buffers_list(struct list_head *list)
  847 {
  848         struct buffer_head *bh;
  849         struct list_head tmp;
  850         int err = 0, err2;
  851         
  852         INIT_LIST_HEAD(&tmp);
  853         
  854         spin_lock(&lru_list_lock);
  855 
  856         while (!list_empty(list)) {
  857                 bh = BH_ENTRY(list->next);
  858                 list_del(&bh->b_inode_buffers);
  859                 if (!buffer_dirty(bh) && !buffer_locked(bh))
  860                         clear_buffer_attached(bh);
  861                 else {
  862                         set_buffer_attached(bh);
  863                         list_add(&bh->b_inode_buffers, &tmp);
  864                         if (buffer_dirty(bh)) {
  865                                 get_bh(bh);
  866                                 spin_unlock(&lru_list_lock);
   867                                 /*
   868                                  * Wait for I/O completion before submitting
   869                                  * the buffer, to be sure the write will be
   870                                  * effective on the latest data in the buffer
   871                                  * (otherwise, if there is old I/O in flight,
   872                                  * ll_rw_block would skip the still-locked
   873                                  * buffer and the write would be a no-op).
   874                                  */
  875                                 wait_on_buffer(bh);
  876                                 ll_rw_block(WRITE, 1, &bh);
  877                                 brelse(bh);
  878                                 spin_lock(&lru_list_lock);
  879                         }
  880                 }
  881         }
  882 
  883         while (!list_empty(&tmp)) {
  884                 bh = BH_ENTRY(tmp.prev);
  885                 remove_inode_queue(bh);
  886                 get_bh(bh);
  887                 spin_unlock(&lru_list_lock);
  888                 wait_on_buffer(bh);
  889                 if (!buffer_uptodate(bh))
  890                         err = -EIO;
  891                 brelse(bh);
  892                 spin_lock(&lru_list_lock);
  893         }
  894         
  895         spin_unlock(&lru_list_lock);
  896         err2 = osync_buffers_list(list);
  897 
  898         if (err)
  899                 return err;
  900         else
  901                 return err2;
  902 }
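
/*
 * A sketch (not part of the original file; the function name is made up)
 * of how a filesystem's fsync method can combine fsync_buffers_list()
 * above with the page-cache flushers: queue the data pages, flush the
 * inode's dirty metadata buffers, then wait for the data pages.
 */
static int example_fs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err, err2;

	err = filemap_fdatasync(inode->i_mapping);        /* queue data pages */
	err2 = fsync_buffers_list(&inode->i_dirty_buffers);
	if (err2 && !err)
		err = err2;
	err2 = filemap_fdatawait(inode->i_mapping);       /* wait for data pages */
	if (err2 && !err)
		err = err2;
	return err;
}
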
  903 
  904 /*
  905  * osync is designed to support O_SYNC io.  It waits synchronously for
  906  * all already-submitted IO to complete, but does not queue any new
  907  * writes to the disk.
  908  *
  909  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
  910  * you dirty the buffers, and then use osync_buffers_list to wait for
  911  * completion.  Any other dirty buffers which are not yet queued for
  912  * write will not be flushed to disk by the osync.
  913  */
  914 static int osync_buffers_list(struct list_head *list)
  915 {
  916         struct buffer_head *bh;
  917         struct list_head *p;
  918         int err = 0;
  919 
  920         spin_lock(&lru_list_lock);
  921         
  922  repeat:
  923         list_for_each_prev(p, list) {
  924                 bh = BH_ENTRY(p);
  925                 if (buffer_locked(bh)) {
  926                         get_bh(bh);
  927                         spin_unlock(&lru_list_lock);
  928                         wait_on_buffer(bh);
  929                         if (!buffer_uptodate(bh))
  930                                 err = -EIO;
  931                         brelse(bh);
  932                         spin_lock(&lru_list_lock);
  933                         goto repeat;
  934                 }
  935         }
  936 
  937         spin_unlock(&lru_list_lock);
  938         return err;
  939 }
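
/*
 * A sketch (not part of the original file; the helper name is made up) of
 * the O_SYNC protocol described above, reduced to a single buffer: queue
 * the write as soon as the buffer is dirtied, then wait only for the I/O
 * that was already submitted.
 */
static int example_osync_one_buffer(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);          /* the buffer now needs writing */
	ll_rw_block(WRITE, 1, &bh);     /* queue it right away ... */
	wait_on_buffer(bh);             /* ... and wait for that I/O only */
	if (!buffer_uptodate(bh))
		return -EIO;
	return 0;
}
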
  940 
  941 /*
  942  * Invalidate any and all dirty buffers on a given inode.  We are
  943  * probably unmounting the fs, but that doesn't mean we have already
  944  * done a sync().  Just drop the buffers from the inode list.
  945  */
  946 void invalidate_inode_buffers(struct inode *inode)
  947 {
  948         struct list_head * entry;
  949         
  950         spin_lock(&lru_list_lock);
  951         while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
  952                 remove_inode_queue(BH_ENTRY(entry));
  953         while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
  954                 remove_inode_queue(BH_ENTRY(entry));
  955         spin_unlock(&lru_list_lock);
  956 }
  957 
  958 
  959 /*
  960  * Ok, this is getblk, and it isn't very clear, again to hinder
  961  * race-conditions. Most of the code is seldom used, (ie repeating),
  962  * so it should be much more efficient than it looks.
  963  *
  964  * The algorithm is changed: hopefully better, and an elusive bug removed.
  965  *
  966  * 14.02.92: changed it to sync dirty buffers a bit: better performance
  967  * when the filesystem starts to get full of dirty blocks (I hope).
  968  */
  969 struct buffer_head * getblk(kdev_t dev, int block, int size)
  970 {
  971         for (;;) {
  972                 struct buffer_head * bh;
  973 
  974                 bh = get_hash_table(dev, block, size);
  975                 if (bh) {
  976                         touch_buffer(bh);
  977                         return bh;
  978                 }
  979 
  980                 if (!grow_buffers(dev, block, size))
  981                         free_more_memory();
  982         }
  983 }
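
/*
 * A sketch (not part of the original file; the helper name is made up) of
 * the typical getblk() caller: take the buffer for a block that is about
 * to be overwritten completely, fill it, and mark it dirty so bdflush
 * writes it back later.
 */
static void example_overwrite_block(kdev_t dev, int block, int size,
				    const char *data)
{
	struct buffer_head *bh = getblk(dev, block, size);

	memcpy(bh->b_data, data, size);         /* new contents for the block */
	mark_buffer_uptodate(bh, 1);            /* contents are now valid */
	mark_buffer_dirty(bh);                  /* schedule the writeback */
	brelse(bh);                             /* drop the getblk() reference */
}
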
  984 
  985 /* -1 -> no need to flush
  986     0 -> async flush
  987     1 -> sync flush (wait for I/O completion) */
  988 static int balance_dirty_state(void)
  989 {
  990         unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
  991 
  992         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
  993         tot = nr_free_buffer_pages();
  994 
  995         dirty *= 100;
  996         soft_dirty_limit = tot * bdf_prm.b_un.nfract;
  997         hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
  998 
  999         /* First, check for the "real" dirty limit. */
 1000         if (dirty > soft_dirty_limit) {
 1001                 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
 1002                         return 1;
 1003                 return 0;
 1004         }
 1005 
 1006         return -1;
 1007 }
 1008 
 1009 static int bdflush_stop(void)
 1010 {
 1011         unsigned long dirty, tot, dirty_limit;
 1012 
 1013         dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 1014         tot = nr_free_buffer_pages();
 1015 
 1016         dirty *= 100;
 1017         dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
 1018 
 1019         if (dirty > dirty_limit)
 1020                 return 0;
 1021         return 1;
 1022 }
 1023 
 1024 /*
 1025  * if a new dirty buffer is created we need to balance bdflush.
 1026  *
 1027  * in the future we might want to make bdflush aware of different
 1028  * pressures on different devices - thus the (currently unused)
 1029  * 'dev' parameter.
 1030  */
 1031 void balance_dirty(void)
 1032 {
 1033         int state = balance_dirty_state();
 1034 
 1035         if (state < 0)
 1036                 return;
 1037 
 1038         wakeup_bdflush();
 1039 
 1040         /*
 1041          * And if we're _really_ out of balance, wait for
 1042          * some of the dirty/locked buffers ourselves.
 1043          * This will throttle heavy writers.
 1044          */
 1045         if (state > 0) {
 1046                 spin_lock(&lru_list_lock);
 1047                 write_some_buffers(NODEV);
 1048         }
 1049 }
 1050 EXPORT_SYMBOL(balance_dirty);
 1051 
 1052 inline void __mark_dirty(struct buffer_head *bh)
 1053 {
 1054         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
 1055         refile_buffer(bh);
 1056 }
 1057 
 1058 /* atomic version, the user must call balance_dirty() by hand
  1059    as soon as it becomes possible to block */
 1060 void __mark_buffer_dirty(struct buffer_head *bh)
 1061 {
 1062         if (!atomic_set_buffer_dirty(bh))
 1063                 __mark_dirty(bh);
 1064 }
 1065 
 1066 void mark_buffer_dirty(struct buffer_head *bh)
 1067 {
 1068         if (!atomic_set_buffer_dirty(bh)) {
 1069                 __mark_dirty(bh);
 1070                 balance_dirty();
 1071         }
 1072 }
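
/*
 * A sketch (not part of the original file; the helper name is made up)
 * contrasting the two variants above: under a spinlock a caller must use
 * the atomic __mark_buffer_dirty() and postpone the possibly blocking
 * balance_dirty() until after the lock is dropped, which is exactly what
 * mark_buffer_dirty() does for the unlocked case.
 */
static void example_dirty_under_lock(spinlock_t *lock, struct buffer_head *bh)
{
	spin_lock(lock);
	__mark_buffer_dirty(bh);        /* atomic, never blocks */
	spin_unlock(lock);
	balance_dirty();                /* safe to block now */
}
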
 1073 
 1074 void set_buffer_flushtime(struct buffer_head *bh)
 1075 {
 1076         bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
 1077 }
 1078 EXPORT_SYMBOL(set_buffer_flushtime);
 1079 
 1080 /*
 1081  * A buffer may need to be moved from one buffer list to another
 1082  * (e.g. in case it is not shared any more). Handle this.
 1083  */
 1084 static void __refile_buffer(struct buffer_head *bh)
 1085 {
 1086         int dispose = BUF_CLEAN;
 1087         if (buffer_locked(bh))
 1088                 dispose = BUF_LOCKED;
 1089         if (buffer_dirty(bh))
 1090                 dispose = BUF_DIRTY;
 1091         if (dispose != bh->b_list) {
 1092                 __remove_from_lru_list(bh);
 1093                 bh->b_list = dispose;
 1094                 if (dispose == BUF_CLEAN)
 1095                         remove_inode_queue(bh);
 1096                 __insert_into_lru_list(bh, dispose);
 1097         }
 1098 }
 1099 
 1100 void refile_buffer(struct buffer_head *bh)
 1101 {
 1102         spin_lock(&lru_list_lock);
 1103         __refile_buffer(bh);
 1104         spin_unlock(&lru_list_lock);
 1105 }
 1106 
 1107 /*
 1108  * Release a buffer head
 1109  */
 1110 void __brelse(struct buffer_head * buf)
 1111 {
 1112         if (atomic_read(&buf->b_count)) {
 1113                 put_bh(buf);
 1114                 return;
 1115         }
 1116         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
 1117 }
 1118 
 1119 /*
 1120  * bforget() is like brelse(), except it discards any
 1121  * potentially dirty data.
 1122  */
 1123 void __bforget(struct buffer_head * buf)
 1124 {
 1125         mark_buffer_clean(buf);
 1126         __brelse(buf);
 1127 }
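
/*
 * A sketch (not part of the original file; the helper name is made up,
 * and it assumes the usual bforget() wrapper around __bforget() from
 * <linux/fs.h>): once a metadata block has been freed on disk, writing
 * back its cached contents would be pointless, so the buffer is dropped
 * with its dirty state discarded.
 */
static void example_drop_freed_metadata(struct buffer_head *bh)
{
	/* the block no longer belongs to anyone; forget any dirty data */
	bforget(bh);
}
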
 1128 
  1129 /**
  1130  *      bread() - reads a specified block and returns the bh
  1131  *      @dev: device the block resides on
  1132  *      @block: number of the block
  1133  *      @size: size (in bytes) to read
  1134  *      Reads a specified block, and returns the buffer head that
  1135  *      contains it. It returns NULL if the block was unreadable.
  1136  */
 1137 struct buffer_head * bread(kdev_t dev, int block, int size)
 1138 {
 1139         struct buffer_head * bh;
 1140 
 1141         bh = getblk(dev, block, size);
 1142         if (buffer_uptodate(bh))
 1143                 return bh;
 1144         set_bit(BH_Sync, &bh->b_state);
 1145         ll_rw_block(READ, 1, &bh);
 1146         wait_on_buffer(bh);
 1147         if (buffer_uptodate(bh))
 1148                 return bh;
 1149         brelse(bh);
 1150         return NULL;
 1151 }
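
/*
 * A sketch (not part of the original file; the helper name is made up) of
 * a typical bread() caller: read one block synchronously, look at it, and
 * release the reference that bread() took.
 */
static int example_read_and_check(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = bread(dev, block, size);

	if (!bh)
		return -EIO;            /* the block was unreadable */
	/* ... examine the size bytes at bh->b_data ... */
	brelse(bh);                     /* balance the reference from bread() */
	return 0;
}
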
 1152 
 1153 /*
 1154  * Note: the caller should wake up the buffer_wait list if needed.
 1155  */
 1156 static void __put_unused_buffer_head(struct buffer_head * bh)
 1157 {
 1158         if (unlikely(buffer_attached(bh)))
 1159                 BUG();
 1160         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
 1161                 kmem_cache_free(bh_cachep, bh);
 1162         } else {
 1163                 bh->b_dev = B_FREE;
 1164                 bh->b_blocknr = -1;
 1165                 bh->b_this_page = NULL;
 1166 
 1167                 nr_unused_buffer_heads++;
 1168                 bh->b_next_free = unused_list;
 1169                 unused_list = bh;
 1170         }
 1171 }
 1172 
 1173 void put_unused_buffer_head(struct buffer_head *bh)
 1174 {
 1175         spin_lock(&unused_list_lock);
 1176         __put_unused_buffer_head(bh);
 1177         spin_unlock(&unused_list_lock);
 1178 }
 1179 EXPORT_SYMBOL(put_unused_buffer_head);
 1180 
 1181 /*
 1182  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 1183  * no-buffer-head deadlock.  Return NULL on failure; waiting for
 1184  * buffer heads is now handled in create_buffers().
 1185  */ 
 1186 struct buffer_head * get_unused_buffer_head(int async)
 1187 {
 1188         struct buffer_head * bh;
 1189 
 1190         spin_lock(&unused_list_lock);
 1191         if (nr_unused_buffer_heads > NR_RESERVED) {
 1192                 bh = unused_list;
 1193                 unused_list = bh->b_next_free;
 1194                 nr_unused_buffer_heads--;
 1195                 spin_unlock(&unused_list_lock);
 1196                 return bh;
 1197         }
 1198         spin_unlock(&unused_list_lock);
 1199 
 1200         /* This is critical.  We can't call out to the FS
 1201          * to get more buffer heads, because the FS may need
 1202          * more buffer-heads itself.  Thus SLAB_NOFS.
 1203          */
 1204         if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
 1205                 bh->b_blocknr = -1;
 1206                 bh->b_this_page = NULL;
 1207                 return bh;
 1208         }
 1209 
 1210         /*
 1211          * If we need an async buffer, use the reserved buffer heads.
 1212          */
 1213         if (async) {
 1214                 spin_lock(&unused_list_lock);
 1215                 if (unused_list) {
 1216                         bh = unused_list;
 1217                         unused_list = bh->b_next_free;
 1218                         nr_unused_buffer_heads--;
 1219                         spin_unlock(&unused_list_lock);
 1220                         return bh;
 1221                 }
 1222                 spin_unlock(&unused_list_lock);
 1223         }
 1224 
 1225         return NULL;
 1226 }
 1227 EXPORT_SYMBOL(get_unused_buffer_head);
 1228 
 1229 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
 1230 {
 1231         if (offset >= PAGE_SIZE)
 1232                 BUG();
 1233 
 1234         if (PageHighMem(page)) {
 1235                 bh->b_data = (char *)offset;
 1236         } else {
 1237                 bh->b_data = page_address(page) + offset;
 1238         }
 1239         bh->b_page = page;
 1240 }
 1241 EXPORT_SYMBOL(set_bh_page);
 1242 
 1243 /*
 1244  * Create the appropriate buffers when given a page for data area and
 1245  * the size of each buffer.. Use the bh->b_this_page linked list to
 1246  * follow the buffers created.  Return NULL if unable to create more
 1247  * buffers.
 1248  * The async flag is used to differentiate async IO (paging, swapping)
 1249  * from ordinary buffer allocations, and only async requests are allowed
 1250  * to sleep waiting for buffer heads. 
 1251  */
 1252 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
 1253 {
 1254         struct buffer_head *bh, *head;
 1255         long offset;
 1256 
 1257 try_again:
 1258         head = NULL;
 1259         offset = PAGE_SIZE;
 1260         while ((offset -= size) >= 0) {
 1261                 bh = get_unused_buffer_head(async);
 1262                 if (!bh)
 1263                         goto no_grow;
 1264 
 1265                 bh->b_dev = NODEV;
 1266                 bh->b_this_page = head;
 1267                 head = bh;
 1268 
 1269                 bh->b_state = 0;
 1270                 bh->b_next_free = NULL;
 1271                 bh->b_pprev = NULL;
 1272                 atomic_set(&bh->b_count, 0);
 1273                 bh->b_size = size;
 1274 
 1275                 set_bh_page(bh, page, offset);
 1276 
 1277                 bh->b_list = BUF_CLEAN;
 1278                 bh->b_end_io = NULL;
 1279         }
 1280         return head;
 1281 /*
 1282  * In case anything failed, we just free everything we got.
 1283  */
 1284 no_grow:
 1285         if (head) {
 1286                 spin_lock(&unused_list_lock);
 1287                 do {
 1288                         bh = head;
 1289                         head = head->b_this_page;
 1290                         __put_unused_buffer_head(bh);
 1291                 } while (head);
 1292                 spin_unlock(&unused_list_lock);
 1293 
 1294                 /* Wake up any waiters ... */
 1295                 wake_up(&buffer_wait);
 1296         }
 1297 
 1298         /*
 1299          * Return failure for non-async IO requests.  Async IO requests
 1300          * are not allowed to fail, so we have to wait until buffer heads
 1301          * become available.  But we don't want tasks sleeping with 
 1302          * partially complete buffers, so all were released above.
 1303          */
 1304         if (!async)
 1305                 return NULL;
 1306 
 1307         /* We're _really_ low on memory. Now we just
 1308          * wait for old buffer heads to become free due to
 1309          * finishing IO.  Since this is an async request and
 1310          * the reserve list is empty, we're sure there are 
 1311          * async buffer heads in use.
 1312          */
 1313         run_task_queue(&tq_disk);
 1314 
 1315         free_more_memory();
 1316         goto try_again;
 1317 }
 1318 
 1319 /*
 1320  * Called when truncating a buffer on a page completely.
 1321  */
 1322 static void discard_buffer(struct buffer_head * bh)
 1323 {
 1324         if (buffer_mapped(bh)) {
 1325                 mark_buffer_clean(bh);
 1326                 lock_buffer(bh);
 1327                 clear_bit(BH_Uptodate, &bh->b_state);
 1328                 clear_bit(BH_Mapped, &bh->b_state);
 1329                 clear_bit(BH_Req, &bh->b_state);
 1330                 clear_bit(BH_New, &bh->b_state);
 1331                 remove_from_queues(bh);
 1332                 unlock_buffer(bh);
 1333         }
 1334 }
 1335 
 1336 /**
 1337  * try_to_release_page - release old fs-specific metadata on a page
 1338  *
 1339  */
 1340 
 1341 int try_to_release_page(struct page * page, int gfp_mask)
 1342 {
 1343         if (!PageLocked(page))
 1344                 BUG();
 1345         
 1346         if (!page->mapping)
 1347                 goto try_to_free;
 1348         if (!page->mapping->a_ops->releasepage)
 1349                 goto try_to_free;
 1350         if (page->mapping->a_ops->releasepage(page, gfp_mask))
 1351                 goto try_to_free;
 1352         /*
 1353          * We couldn't release buffer metadata; don't even bother trying
 1354          * to release buffers.
 1355          */
 1356         return 0;
 1357 try_to_free:    
 1358         return try_to_free_buffers(page, gfp_mask);
 1359 }
 1360 
 1361 /*
 1362  * We don't have to release all buffers here, but
 1363  * we have to be sure that no dirty buffer is left
 1364  * and no IO is going on (no buffer is locked), because
 1365  * we have truncated the file and are going to free the
 1366  * blocks on-disk..
 1367  */
 1368 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
 1369 {
 1370         struct buffer_head *head, *bh, *next;
 1371         unsigned int curr_off = 0;
 1372 
 1373         if (!PageLocked(page))
 1374                 BUG();
 1375         if (!page->buffers)
 1376                 return 1;
 1377 
 1378         head = page->buffers;
 1379         bh = head;
 1380         do {
 1381                 unsigned int next_off = curr_off + bh->b_size;
 1382                 next = bh->b_this_page;
 1383 
 1384                 /*
 1385                  * is this block fully flushed?
 1386                  */
 1387                 if (offset <= curr_off)
 1388                         discard_buffer(bh);
 1389                 curr_off = next_off;
 1390                 bh = next;
 1391         } while (bh != head);
 1392 
 1393         /*
 1394          * subtle. We release buffer-heads only if this is
 1395          * the 'final' flushpage. We have invalidated the get_block
 1396          * cached value unconditionally, so real IO is not
 1397          * possible anymore.
 1398          *
 1399          * If the free doesn't work out, the buffers can be
 1400          * left around - they just turn into anonymous buffers
 1401          * instead.
 1402          */
 1403         if (!offset) {
 1404                 if (!try_to_release_page(page, 0))
 1405                         return 0;
 1406         }
 1407 
 1408         return 1;
 1409 }
 1410 
 1411 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
 1412 {
 1413         struct buffer_head *bh, *head, *tail;
 1414 
  1415         /* FIXME: create_buffers should fail if there's not enough memory */
 1416         head = create_buffers(page, blocksize, 1);
 1417         if (page->buffers)
 1418                 BUG();
 1419 
 1420         bh = head;
 1421         do {
 1422                 bh->b_dev = dev;
 1423                 bh->b_blocknr = 0;
 1424                 bh->b_end_io = NULL;
 1425                 tail = bh;
 1426                 bh = bh->b_this_page;
 1427         } while (bh);
 1428         tail->b_this_page = head;
 1429         page->buffers = head;
 1430         page_cache_get(page);
 1431 }
 1432 EXPORT_SYMBOL(create_empty_buffers);
 1433 
 1434 /*
 1435  * We are taking a block for data and we don't want any output from any
  1436  * buffer-cache aliases starting from the return of this function and
  1437  * until the moment when something explicitly marks the buffer
  1438  * dirty (hopefully that will not happen until we free that block ;-)
  1439  * We don't even need to mark it not-uptodate - nobody can expect
  1440  * anything from a newly allocated buffer anyway. We used to use
 1441  * unmap_buffer() for such invalidation, but that was wrong. We definitely
 1442  * don't want to mark the alias unmapped, for example - it would confuse
 1443  * anyone who might pick it with bread() afterwards...
 1444  */
 1445 
 1446 static void unmap_underlying_metadata(struct buffer_head * bh)
 1447 {
 1448         struct buffer_head *old_bh;
 1449 
 1450         old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
 1451         if (old_bh) {
 1452                 mark_buffer_clean(old_bh);
 1453                 wait_on_buffer(old_bh);
 1454                 clear_bit(BH_Req, &old_bh->b_state);
 1455                 __brelse(old_bh);
 1456         }
 1457 }
 1458 
 1459 /*
 1460  * NOTE! All mapped/uptodate combinations are valid:
 1461  *
 1462  *      Mapped  Uptodate        Meaning
 1463  *
 1464  *      No      No              "unknown" - must do get_block()
 1465  *      No      Yes             "hole" - zero-filled
 1466  *      Yes     No              "allocated" - allocated on disk, not read in
 1467  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
 1468  *
 1469  * "Dirty" is valid only with the last case (mapped+uptodate).
 1470  */
 1471 
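/*
 * A sketch (not part of the original file; the function name is made up)
 * spelling out the state table above as code.
 */
static const char *example_buffer_state_name(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return buffer_uptodate(bh) ? "hole" : "unknown";
	return buffer_uptodate(bh) ? "valid" : "allocated";
}
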
 1472 /*
 1473  * block_write_full_page() is SMP threaded - the kernel lock is not held.
 1474  */
 1475 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
 1476 {
 1477         int err, i;
 1478         unsigned long block;
 1479         struct buffer_head *bh, *head;
 1480         int need_unlock;
 1481 
 1482         if (!PageLocked(page))
 1483                 BUG();
 1484 
 1485         if (!page->buffers)
 1486                 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
 1487         head = page->buffers;
 1488 
 1489         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 1490 
 1491         bh = head;
 1492         i = 0;
 1493 
 1494         /* Stage 1: make sure we have all the buffers mapped! */
 1495         do {
 1496                 /*
 1497                  * If the buffer isn't up-to-date, we can't be sure
 1498                  * that the buffer has been initialized with the proper
 1499                  * block number information etc..
 1500                  *
 1501                  * Leave it to the low-level FS to make all those
 1502                  * decisions (block #0 may actually be a valid block)
 1503                  */
 1504                 if (!buffer_mapped(bh)) {
 1505                         err = get_block(inode, block, bh, 1);
 1506                         if (err)
 1507                                 goto out;
 1508                         if (buffer_new(bh))
 1509                                 unmap_underlying_metadata(bh);
 1510                 }
 1511                 bh = bh->b_this_page;
 1512                 block++;
 1513         } while (bh != head);
 1514 
 1515         /* Stage 2: lock the buffers, mark them clean */
 1516         do {
 1517                 lock_buffer(bh);
 1518                 set_buffer_async_io(bh);
 1519                 set_bit(BH_Uptodate, &bh->b_state);
 1520                 clear_bit(BH_Dirty, &bh->b_state);
 1521                 bh = bh->b_this_page;
 1522         } while (bh != head);
 1523 
 1524         /* Stage 3: submit the IO */
 1525         do {
 1526                 struct buffer_head *next = bh->b_this_page;
 1527                 submit_bh(WRITE, bh);
 1528                 bh = next;
 1529         } while (bh != head);
 1530 
 1531         /* Done - end_buffer_io_async will unlock */
 1532         SetPageUptodate(page);
 1533 
 1534         wakeup_page_waiters(page);
 1535 
 1536         return 0;
 1537 
 1538 out:
 1539         /*
 1540          * ENOSPC, or some other error.  We may already have added some
 1541          * blocks to the file, so we need to write these out to avoid
 1542          * exposing stale data.
 1543          */
 1544         ClearPageUptodate(page);
 1545         bh = head;
 1546         need_unlock = 1;
 1547         /* Recovery: lock and submit the mapped buffers */
 1548         do {
 1549                 if (buffer_mapped(bh)) {
 1550                         lock_buffer(bh);
 1551                         set_buffer_async_io(bh);
 1552                         need_unlock = 0;
 1553                 }
 1554                 bh = bh->b_this_page;
 1555         } while (bh != head);
 1556         do {
 1557                 struct buffer_head *next = bh->b_this_page;
 1558                 if (buffer_mapped(bh)) {
 1559                         set_bit(BH_Uptodate, &bh->b_state);
 1560                         clear_bit(BH_Dirty, &bh->b_state);
 1561                         submit_bh(WRITE, bh);
 1562                 }
 1563                 bh = next;
 1564         } while (bh != head);
 1565         if (need_unlock)
 1566                 UnlockPage(page);
 1567         wakeup_page_waiters(page);
 1568         return err;
 1569 }
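Stage 1 above derives the first logical block covered by the page from page->index with block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits), then advances one block per buffer. A standalone sketch of that arithmetic, assuming a 4096-byte page (PAGE_CACHE_SHIFT of 12) purely for illustration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12     /* assumed: 4096-byte pages */

/* First logical block of a page, given the file's block size in bits. */
static unsigned long first_block(unsigned long page_index, unsigned blkbits)
{
        return page_index << (PAGE_CACHE_SHIFT - blkbits);
}

int main(void)
{
        /* 1024-byte blocks (blkbits == 10): four blocks per page. */
        printf("page 0 starts at block %lu\n", first_block(0, 10));
        printf("page 3 starts at block %lu\n", first_block(3, 10));
        /* 4096-byte blocks (blkbits == 12): one block per page.   */
        printf("page 3 starts at block %lu\n", first_block(3, 12));
        return 0;
}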
 1570 
 1571 static int __block_prepare_write(struct inode *inode, struct page *page,
 1572                 unsigned from, unsigned to, get_block_t *get_block)
 1573 {
 1574         unsigned block_start, block_end;
 1575         unsigned long block;
 1576         int err = 0;
 1577         unsigned blocksize, bbits;
 1578         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
 1579         char *kaddr = kmap(page);
 1580 
 1581         blocksize = 1 << inode->i_blkbits;
 1582         if (!page->buffers)
 1583                 create_empty_buffers(page, inode->i_dev, blocksize);
 1584         head = page->buffers;
 1585 
 1586         bbits = inode->i_blkbits;
 1587         block = page->index << (PAGE_CACHE_SHIFT - bbits);
 1588 
 1589         for(bh = head, block_start = 0; bh != head || !block_start;
 1590             block++, block_start=block_end, bh = bh->b_this_page) {
 1591                 if (!bh)
 1592                         BUG();
 1593                 block_end = block_start+blocksize;
 1594                 if (block_end <= from)
 1595                         continue;
 1596                 if (block_start >= to)
 1597                         break;
 1598                 clear_bit(BH_New, &bh->b_state);
 1599                 if (!buffer_mapped(bh)) {
 1600                         err = get_block(inode, block, bh, 1);
 1601                         if (err)
 1602                                 goto out;
 1603                         if (buffer_new(bh)) {
 1604                                 unmap_underlying_metadata(bh);
 1605                                 if (Page_Uptodate(page)) {
 1606                                         set_bit(BH_Uptodate, &bh->b_state);
 1607                                         continue;
 1608                                 }
 1609                                 if (block_end > to)
 1610                                         memset(kaddr+to, 0, block_end-to);
 1611                                 if (block_start < from)
 1612                                         memset(kaddr+block_start, 0, from-block_start);
 1613                                 if (block_end > to || block_start < from)
 1614                                         flush_dcache_page(page);
 1615                                 continue;
 1616                         }
 1617                 }
 1618                 if (Page_Uptodate(page)) {
 1619                         set_bit(BH_Uptodate, &bh->b_state);
 1620                         continue; 
 1621                 }
 1622                 if (!buffer_uptodate(bh) &&
 1623                      (block_start < from || block_end > to)) {
 1624                         ll_rw_block(READ, 1, &bh);
 1625                         *wait_bh++=bh;
 1626                 }
 1627         }
 1628         /*
 1629          * If we issued read requests - let them complete.
 1630          */
 1631         while(wait_bh > wait) {
 1632                 wait_on_buffer(*--wait_bh);
 1633                 if (!buffer_uptodate(*wait_bh))
 1634                         return -EIO;
 1635         }
 1636         return 0;
 1637 out:
 1638         /*
 1639          * Zero out any newly allocated blocks to avoid exposing stale
 1640          * data.  If BH_New is set, we know that the block was newly
 1641          * allocated in the above loop.
 1642          *
 1643          * In detail, the buffer can be new and uptodate because:
 1644          * 1) hole in an uptodate page: get_block(create) allocates the block,
 1645          *    so the buffer is new and additionally we also mark it uptodate
 1646          * 2) The buffer is not mapped but uptodate due to a previous partial read.
 1647          *
 1648          * We can always ignore uptodate buffers here; if you mark a buffer
 1649          * uptodate you must make sure it contains the right data first.
 1650          *
 1651          * We must stop the "undo/clear" fixup pass not at the caller's "to"
 1652          * but at the last block that we successfully reached in the main loop.
 1653          */
 1654         bh = head;
 1655         to = block_start; /* stop at the last successfully handled block */
 1656         block_start = 0;
 1657         do {
 1658                 block_end = block_start+blocksize;
 1659                 if (block_end <= from)
 1660                         goto next_bh;
 1661                 if (block_start >= to)
 1662                         break;
 1663                 if (buffer_new(bh) && !buffer_uptodate(bh)) {
 1664                         memset(kaddr+block_start, 0, bh->b_size);
 1665                         flush_dcache_page(page);
 1666                         set_bit(BH_Uptodate, &bh->b_state);
 1667                         mark_buffer_dirty(bh);
 1668                 }
 1669 next_bh:
 1670                 block_start = block_end;
 1671                 bh = bh->b_this_page;
 1672         } while (bh != head);
 1673         return err;
 1674 }
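The main loop above touches only the buffers whose byte range [block_start, block_end) intersects the write range [from, to): blocks ending at or before from are skipped, blocks starting at or after to end the loop, and blocks that only partially overlap may need a read first. A self-contained sketch of that classification, with made-up from/to/blocksize values:

#include <stdio.h>

int main(void)
{
        unsigned blocksize = 1024;       /* assumed block size           */
        unsigned from = 1500, to = 3000; /* made-up write range in page  */
        unsigned block_start, block_end;

        for (block_start = 0; block_start < 4096; block_start = block_end) {
                block_end = block_start + blocksize;
                if (block_end <= from) {
                        printf("[%u,%u) before write range - skipped\n",
                               block_start, block_end);
                        continue;
                }
                if (block_start >= to) {
                        printf("[%u,%u) after write range - loop stops\n",
                               block_start, block_end);
                        break;
                }
                printf("[%u,%u) overlaps [%u,%u) - must be mapped%s\n",
                       block_start, block_end, from, to,
                       (block_start < from || block_end > to) ?
                       " and read in if not uptodate (partial block)" : "");
        }
        return 0;
}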
 1675 
 1676 static int __block_commit_write(struct inode *inode, struct page *page,
 1677                 unsigned from, unsigned to)
 1678 {
 1679         unsigned block_start, block_end;
 1680         int partial = 0, need_balance_dirty = 0;
 1681         unsigned blocksize;
 1682         struct buffer_head *bh, *head;
 1683 
 1684         blocksize = 1 << inode->i_blkbits;
 1685 
 1686         for(bh = head = page->buffers, block_start = 0;
 1687             bh != head || !block_start;
 1688             block_start=block_end, bh = bh->b_this_page) {
 1689                 block_end = block_start + blocksize;
 1690                 if (block_end <= from || block_start >= to) {
 1691                         if (!buffer_uptodate(bh))
 1692                                 partial = 1;
 1693                 } else {
 1694                         set_bit(BH_Uptodate, &bh->b_state);
 1695                         if (!atomic_set_buffer_dirty(bh)) {
 1696                                 __mark_dirty(bh);
 1697                                 buffer_insert_inode_data_queue(bh, inode);
 1698                                 need_balance_dirty = 1;
 1699                         }
 1700                 }
 1701         }
 1702 
 1703         if (need_balance_dirty)
 1704                 balance_dirty();
 1705         /*
 1706          * If this is a partial write that happened to make all buffers
 1707          * uptodate, then we can optimize away a bogus readpage() for
 1708          * the next read(). Here we 'discover' whether the page went
 1709          * uptodate as a result of this (potentially partial) write.
 1710          */
 1711         if (!partial)
 1712                 SetPageUptodate(page);
 1713         return 0;
 1714 }
 1715 
 1716 /*
 1717  * Generic "read page" function for block devices that have the normal
 1718  * get_block functionality. This is most of the block device filesystems.
 1719  * Reads the page asynchronously --- the unlock_buffer() and
 1720  * mark_buffer_uptodate() functions propagate buffer state into the
 1721  * page struct once IO has completed.
 1722  */
 1723 int block_read_full_page(struct page *page, get_block_t *get_block)
 1724 {
 1725         struct inode *inode = page->mapping->host;
 1726         unsigned long iblock, lblock;
 1727         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
 1728         unsigned int blocksize, blocks;
 1729         int nr, i;
 1730 
 1731         if (!PageLocked(page))
 1732                 PAGE_BUG(page);
 1733         blocksize = 1 << inode->i_blkbits;
 1734         if (!page->buffers)
 1735                 create_empty_buffers(page, inode->i_dev, blocksize);
 1736         head = page->buffers;
 1737 
 1738         blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
 1739         iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 1740         lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
 1741         bh = head;
 1742         nr = 0;
 1743         i = 0;
 1744 
 1745         do {
 1746                 if (buffer_uptodate(bh))
 1747                         continue;
 1748 
 1749                 if (!buffer_mapped(bh)) {
 1750                         if (iblock < lblock) {
 1751                                 if (get_block(inode, iblock, bh, 0))
 1752                                         continue;
 1753                         }
 1754                         if (!buffer_mapped(bh)) {
 1755                                 memset(kmap(page) + i*blocksize, 0, blocksize);
 1756                                 flush_dcache_page(page);
 1757                                 kunmap(page);
 1758                                 set_bit(BH_Uptodate, &bh->b_state);
 1759                                 continue;
 1760                         }
 1761                         /* get_block() might have updated the buffer synchronously */
 1762                         if (buffer_uptodate(bh))
 1763                                 continue;
 1764                 }
 1765 
 1766                 arr[nr] = bh;
 1767                 nr++;
 1768         } while (i++, iblock++, (bh = bh->b_this_page) != head);
 1769 
 1770         if (!nr) {
 1771                 /*
 1772                  * all buffers are uptodate - we can set the page
 1773                  * uptodate as well.
 1774                  */
 1775                 SetPageUptodate(page);
 1776                 UnlockPage(page);
 1777                 return 0;
 1778         }
 1779 
 1780         /* Stage two: lock the buffers */
 1781         for (i = 0; i < nr; i++) {
 1782                 struct buffer_head * bh = arr[i];
 1783                 lock_buffer(bh);
 1784                 set_buffer_async_io(bh);
 1785         }
 1786 
 1787         /* Stage 3: start the IO */
 1788         for (i = 0; i < nr; i++) {
 1789                 struct buffer_head * bh = arr[i];
 1790                 if (buffer_uptodate(bh))
 1791                         end_buffer_io_async(bh, 1);
 1792                 else
 1793                         submit_bh(READ, bh);
 1794         }
 1795 
 1796         wakeup_page_waiters(page);
 1797         
 1798         return 0;
 1799 }
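block_read_full_page() computes iblock (the first block on the page) and lblock (the file size in blocks, rounded up); any buffer the filesystem leaves unmapped - including everything at or past lblock - is treated as a hole and zero-filled. A sketch of that bookkeeping, assuming 4096-byte pages and made-up sizes:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* assumed 4096-byte pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned blkbits = 10;                  /* 1024-byte blocks        */
        unsigned long blocksize = 1UL << blkbits;
        unsigned long i_size = 5500;            /* made-up file size       */
        unsigned long page_index = 1;           /* second page of the file */

        unsigned long iblock = page_index << (PAGE_CACHE_SHIFT - blkbits);
        unsigned long lblock = (i_size + blocksize - 1) >> blkbits;
        unsigned long blocks = PAGE_CACHE_SIZE >> blkbits;
        unsigned long i;

        for (i = 0; i < blocks; i++, iblock++)
                printf("block %lu: %s\n", iblock,
                       iblock < lblock ? "ask get_block()"
                                       : "past EOF - zero-filled hole");
        return 0;
}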
 1800 
 1801 /* utility function for filesystems that need to do work on expanding
 1802  * truncates.  Uses prepare/commit_write to allow the filesystem to
 1803  * deal with the hole.  
 1804  */
 1805 int generic_cont_expand(struct inode *inode, loff_t size)
 1806 {
 1807         struct address_space *mapping = inode->i_mapping;
 1808         struct page *page;
 1809         unsigned long index, offset, limit;
 1810         int err;
 1811 
 1812         err = -EFBIG;
 1813         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 1814         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
 1815                 send_sig(SIGXFSZ, current, 0);
 1816                 goto out;
 1817         }
 1818         if (size > inode->i_sb->s_maxbytes)
 1819                 goto out;
 1820 
 1821         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
 1822 
 1823         /* ugh.  in prepare/commit_write, if from==to==start of block, we 
 1824         ** skip the prepare.  make sure we never send an offset for the start
 1825         ** of a block
 1826         */
 1827         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
 1828                 offset++;
 1829         }
 1830         index = size >> PAGE_CACHE_SHIFT;
 1831         err = -ENOMEM;
 1832         page = grab_cache_page(mapping, index);
 1833         if (!page)
 1834                 goto out;
 1835         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
 1836         if (!err) {
 1837                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
 1838         }
 1839         UnlockPage(page);
 1840         page_cache_release(page);
 1841         if (err > 0)
 1842                 err = 0;
 1843 out:
 1844         return err;
 1845 }
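generic_cont_expand() turns the new size into a page index and an offset within that page, and bumps a block-aligned offset by one byte so that prepare/commit is never called with from == to == start of a block (which would skip the prepare, per the comment above). A minimal sketch of that computation, assuming 4096-byte pages and a 1024-byte filesystem block size:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* assumed 4096-byte pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned long s_blocksize = 1024;       /* assumed fs block size      */
        unsigned long long sizes[] = { 5500, 6144 };  /* 6144 is block-aligned */
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long long size = sizes[i];
                unsigned long offset = size & (PAGE_CACHE_SIZE - 1);
                unsigned long index;

                if ((offset & (s_blocksize - 1)) == 0)
                        offset++;               /* avoid from == to == block start */
                index = size >> PAGE_CACHE_SHIFT;
                printf("size %llu -> page %lu, offset %lu\n", size, index, offset);
        }
        return 0;
}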
 1846 
 1847 /*
 1848  * For moronic filesystems that do not allow holes in a file.
 1849  * We may have to extend the file.
 1850  */
 1851 
 1852 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
 1853 {
 1854         struct address_space *mapping = page->mapping;
 1855         struct inode *inode = mapping->host;
 1856         struct page *new_page;
 1857         unsigned long pgpos;
 1858         long status;
 1859         unsigned zerofrom;
 1860         unsigned blocksize = 1 << inode->i_blkbits;
 1861         char *kaddr;
 1862 
 1863         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
 1864                 status = -ENOMEM;
 1865                 new_page = grab_cache_page(mapping, pgpos);
 1866                 if (!new_page)
 1867                         goto out;
 1868                 /* we might sleep */
 1869                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
 1870                         UnlockPage(new_page);
 1871                         page_cache_release(new_page);
 1872                         continue;
 1873                 }
 1874                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
 1875                 if (zerofrom & (blocksize-1)) {
 1876                         *bytes |= (blocksize-1);
 1877                         (*bytes)++;
 1878                 }
 1879                 status = __block_prepare_write(inode, new_page, zerofrom,
 1880                                                 PAGE_CACHE_SIZE, get_block);
 1881                 if (status)
 1882                         goto out_unmap;
 1883                 kaddr = page_address(new_page);
 1884                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
 1885                 flush_dcache_page(new_page);
 1886                 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
 1887                 kunmap(new_page);
 1888                 UnlockPage(new_page);
 1889                 page_cache_release(new_page);
 1890         }
 1891 
 1892         if (page->index < pgpos) {
 1893                 /* completely inside the area */
 1894                 zerofrom = offset;
 1895         } else {
 1896                 /* page covers the boundary, find the boundary offset */
 1897                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
 1898 
 1899                 /* if we will expand the thing, the last block will be filled */
 1900                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
 1901                         *bytes |= (blocksize-1);
 1902                         (*bytes)++;
 1903                 }
 1904 
 1905                 /* starting below the boundary? Nothing to zero out */
 1906                 if (offset <= zerofrom)
 1907                         zerofrom = offset;
 1908         }
 1909         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
 1910         if (status)
 1911                 goto out1;
 1912         kaddr = page_address(page);
 1913         if (zerofrom < offset) {
 1914                 memset(kaddr+zerofrom, 0, offset-zerofrom);
 1915                 flush_dcache_page(page);
 1916                 __block_commit_write(inode, page, zerofrom, offset);
 1917         }
 1918         return 0;
 1919 out1:
 1920         ClearPageUptodate(page);
 1921         kunmap(page);
 1922         return status;
 1923 
 1924 out_unmap:
 1925         ClearPageUptodate(new_page);
 1926         kunmap(new_page);
 1927         UnlockPage(new_page);
 1928         page_cache_release(new_page);
 1929 out:
 1930         return status;
 1931 }
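Both places above that extend *bytes use the same trick: *bytes |= (blocksize - 1); (*bytes)++; fills in the low bits and then adds one, rounding *bytes up to the next multiple of the power-of-two block size (the callers only do this when the value is not already block-aligned, per the zerofrom checks). A tiny sketch of the identity, with made-up values:

#include <stdio.h>

/* Round v up to the next multiple of a power-of-two blocksize, the way
 * cont_prepare_write() advances *bytes.  (The kernel only does this when
 * the value is not already block-aligned.) */
static unsigned long round_up(unsigned long v, unsigned long blocksize)
{
        v |= (blocksize - 1);
        v++;
        return v;
}

int main(void)
{
        printf("%lu\n", round_up(5500, 1024));  /* prints 6144 */
        printf("%lu\n", round_up(6143, 1024));  /* prints 6144 */
        return 0;
}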
 1932 
 1933 int block_prepare_write(struct page *page, unsigned from, unsigned to,
 1934                         get_block_t *get_block)
 1935 {
 1936         struct inode *inode = page->mapping->host;
 1937         int err = __block_prepare_write(inode, page, from, to, get_block);
 1938         if (err) {
 1939                 ClearPageUptodate(page);
 1940                 kunmap(page);
 1941         }
 1942         return err;
 1943 }
 1944 
 1945 int block_commit_write(struct page *page, unsigned from, unsigned to)
 1946 {
 1947         struct inode *inode = page->mapping->host;
 1948         __block_commit_write(inode,page,from,to);
 1949         kunmap(page);
 1950         return 0;
 1951 }
 1952 
 1953 int generic_commit_write(struct file *file, struct page *page,
 1954                 unsigned from, unsigned to)
 1955 {
 1956         struct inode *inode = page->mapping->host;
 1957         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 1958         __block_commit_write(inode,page,from,to);
 1959         kunmap(page);
 1960         if (pos > inode->i_size) {
 1961                 inode->i_size = pos;
 1962                 mark_inode_dirty(inode);
 1963         }
 1964         return 0;
 1965 }
 1966 
 1967 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
 1968 {
 1969         unsigned long index = from >> PAGE_CACHE_SHIFT;
 1970         unsigned offset = from & (PAGE_CACHE_SIZE-1);
 1971         unsigned blocksize, iblock, length, pos;
 1972         struct inode *inode = mapping->host;
 1973         struct page *page;
 1974         struct buffer_head *bh;
 1975         int err;
 1976 
 1977         blocksize = 1 << inode->i_blkbits;
 1978         length = offset & (blocksize - 1);
 1979 
 1980         /* Block boundary? Nothing to do */
 1981         if (!length)
 1982                 return 0;
 1983 
 1984         length = blocksize - length;
 1985         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 1986         
 1987         page = grab_cache_page(mapping, index);
 1988         err = -ENOMEM;
 1989         if (!page)
 1990                 goto out;
 1991 
 1992         if (!page->buffers)
 1993                 create_empty_buffers(page, inode->i_dev, blocksize);
 1994 
 1995         /* Find the buffer that contains "offset" */
 1996         bh = page->buffers;
 1997         pos = blocksize;
 1998         while (offset >= pos) {
 1999                 bh = bh->b_this_page;
 2000                 iblock++;
 2001                 pos += blocksize;
 2002         }
 2003 
 2004         err = 0;
 2005         if (!buffer_mapped(bh)) {
 2006                 /* Hole? Nothing to do */
 2007                 if (buffer_uptodate(bh))
 2008                         goto unlock;
 2009                 get_block(inode, iblock, bh, 0);
 2010                 /* Still unmapped? Nothing to do */
 2011                 if (!buffer_mapped(bh))
 2012                         goto unlock;
 2013         }
 2014 
 2015         /* Ok, it's mapped. Make sure it's up-to-date */
 2016         if (Page_Uptodate(page))
 2017                 set_bit(BH_Uptodate, &bh->b_state);
 2018 
 2019         if (!buffer_uptodate(bh)) {
 2020                 err = -EIO;
 2021                 ll_rw_block(READ, 1, &bh);
 2022                 wait_on_buffer(bh);
 2023                 /* Uhhuh. Read error. Complain and punt. */
 2024                 if (!buffer_uptodate(bh))
 2025                         goto unlock;
 2026         }
 2027 
 2028         memset(kmap(page) + offset, 0, length);
 2029         flush_dcache_page(page);
 2030         kunmap(page);
 2031 
 2032         if (!atomic_set_buffer_dirty(bh)) {
 2033                 __mark_dirty(bh);
 2034                 buffer_insert_inode_data_queue(bh, inode);
 2035                 balance_dirty();
 2036         }
 2037 
 2038         err = 0;
 2039 
 2040 unlock:
 2041         UnlockPage(page);
 2042         page_cache_release(page);
 2043 out:
 2044         return err;
 2045 }
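block_truncate_page() only has work to do when the truncation point falls inside a block: it walks the page's buffer ring to the buffer containing offset, then zeroes length = blocksize - (offset & (blocksize - 1)) bytes from offset to the end of that block. A sketch of the offset arithmetic, assuming 4096-byte pages and 1024-byte blocks with a made-up truncation point:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* assumed 4096-byte pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned blkbits = 10;                  /* 1024-byte blocks         */
        unsigned long blocksize = 1UL << blkbits;
        unsigned long long from = 5500;         /* made-up truncation point */

        unsigned long index  = from >> PAGE_CACHE_SHIFT;
        unsigned long offset = from & (PAGE_CACHE_SIZE - 1);
        unsigned long length = offset & (blocksize - 1);

        if (!length) {
                printf("block boundary - nothing to zero\n");
                return 0;
        }
        length = blocksize - length;
        printf("page %lu: zero %lu bytes starting at offset %lu\n",
               index, length, offset);
        /* i.e. bytes [1404, 2048) of page 1 are cleared; the buffer that
         * contains offset 1404 is found by stepping blocksize at a time. */
        return 0;
}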
 2046 
 2047 int block_write_full_page(struct page *page, get_block_t *get_block)
 2048 {
 2049         struct inode *inode = page->mapping->host;
 2050         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 2051         unsigned offset;
 2052         int err;
 2053 
 2054         /* easy case */
 2055         if (page->index < end_index)
 2056                 return __block_write_full_page(inode, page, get_block);
 2057 
 2058         /* things got complicated... */
 2059         offset = inode->i_size & (PAGE_CACHE_SIZE-1);
 2060         /* OK, are we completely out? */
 2061         if (page->index >= end_index+1 || !offset) {
 2062                 UnlockPage(page);
 2063                 return -EIO;
 2064         }
 2065 
 2066         /* Sigh... will have to work, then... */
 2067         err = __block_prepare_write(inode, page, 0, offset, get_block);
 2068         if (!err) {
 2069                 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
 2070                 flush_dcache_page(page);
 2071                 __block_commit_write(inode,page,0,offset);
 2072 done:
 2073                 kunmap(page);
 2074                 UnlockPage(page);
 2075                 return err;
 2076         }
 2077         ClearPageUptodate(page);
 2078         goto done;
 2079 }
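block_write_full_page() classifies the page against end_index = i_size >> PAGE_CACHE_SHIFT: pages wholly inside the file are written as-is, the page that straddles i_size is written only up to offset = i_size & (PAGE_CACHE_SIZE - 1) with the remainder zeroed, and pages entirely past the end get -EIO. A sketch of that classification with a made-up i_size and an assumed 4096-byte page:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* assumed 4096-byte pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned long long i_size = 10000;      /* made-up file size        */
        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
        unsigned long offset = i_size & (PAGE_CACHE_SIZE - 1);
        unsigned long page_index;

        for (page_index = 0; page_index < 4; page_index++) {
                if (page_index < end_index)
                        printf("page %lu: fully inside - write whole page\n",
                               page_index);
                else if (page_index >= end_index + 1 || !offset)
                        printf("page %lu: beyond EOF - -EIO\n", page_index);
                else
                        printf("page %lu: straddles EOF - write first %lu "
                               "bytes, zero the rest\n", page_index, offset);
        }
        return 0;
}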
 2080 
 2081 /*
 2082  * Commence writeout of all the buffers against a page.  The
 2083  * page must be locked.   Returns zero on success or a negative
 2084  * errno.
 2085  */
 2086 int writeout_one_page(struct page *page)
 2087 {
 2088         struct buffer_head *bh, *head = page->buffers;
 2089 
 2090         if (!PageLocked(page))
 2091                 BUG();
 2092         bh = head;
 2093         do {
 2094                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
 2095                         continue;
 2096 
 2097                 bh->b_flushtime = jiffies;
 2098                 ll_rw_block(WRITE, 1, &bh);     
 2099         } while ((bh = bh->b_this_page) != head);
 2100         return 0;
 2101 }
 2102 EXPORT_SYMBOL(writeout_one_page);
 2103 
 2104 /*
 2105  * Wait for completion of I/O of all buffers against a page.  The page
 2106  * must be locked.  Returns zero on success or a negative errno.
 2107  */
 2108 int waitfor_one_page(struct page *page)
 2109 {
 2110         int error = 0;
 2111         struct buffer_head *bh, *head = page->buffers;
 2112 
 2113         bh = head;
 2114         do {
 2115                 wait_on_buffer(bh);
 2116                 if (buffer_req(bh) && !buffer_uptodate(bh))
 2117                         error = -EIO;
 2118         } while ((bh = bh->b_this_page) != head);
 2119         return error;
 2120 }
 2121 EXPORT_SYMBOL(waitfor_one_page);
 2122 
 2123 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
 2124 {
 2125         struct buffer_head tmp;
 2126         struct inode *inode = mapping->host;
 2127         tmp.b_state = 0;
 2128         tmp.b_blocknr = 0;
 2129         get_block(inode, block, &tmp, 0);
 2130         return tmp.b_blocknr;
 2131 }
 2132 
 2133 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
 2134 {
 2135         int i, nr_blocks, retval;
 2136         unsigned long * blocks = iobuf->blocks;
 2137         int length;
 2138         int beyond_eof = 0;
 2139         
 2140         length = iobuf->length;
 2141         nr_blocks = length / blocksize;
 2142         /* build the blocklist */
 2143         for (i = 0; i < nr_blocks; i++, blocknr++) {
 2144                 struct buffer_head bh;
 2145 
 2146                 bh.b_state = 0;
 2147                 bh.b_dev = inode->i_dev;
 2148                 bh.b_size = blocksize;
 2149                 bh.b_page = NULL;
 2150 
 2151                 if (((loff_t) blocknr) * blocksize >= inode->i_size)
 2152                         beyond_eof = 1;
 2153 
 2154                 /* Only allow get_block to create new blocks if we are safely
 2155                    beyond EOF.  O_DIRECT is unsafe inside sparse files. */
 2156                 retval = get_block(inode, blocknr, &bh, 
 2157                                    ((rw != READ) && beyond_eof));
 2158 
 2159                 if (retval) {
 2160                         if (!i)
 2161                                 /* report error to userspace */
 2162                                 goto out;
 2163                         else
 2164                                 /* do short I/O until 'i' */
 2165                                 break;
 2166                 }
 2167 
 2168                 if (rw == READ) {
 2169                         if (buffer_new(&bh))
 2170                                 BUG();
 2171                         if (!buffer_mapped(&bh)) {
 2172                                 /* there was a hole in the filesystem */
 2173                                 blocks[i] = -1UL;
 2174                                 continue;
 2175                         }
 2176                 } else {
 2177                         if (buffer_new(&bh))
 2178                                 unmap_underlying_metadata(&bh);
 2179                         if (!buffer_mapped(&bh))
 2180                                 /* upper layers need to pass the error on or
 2181                                  * fall back to buffered IO. */
 2182                                 return -ENOTBLK;
 2183                 }
 2184                 blocks[i] = bh.b_blocknr;
 2185         }
 2186 
 2187         /* patch length to handle short I/O */
 2188         iobuf->length = i * blocksize;
 2189         if (!beyond_eof)
 2190                 up(&inode->i_sem);
 2191         retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
 2192         if (!beyond_eof)
 2193                 down(&inode->i_sem);
 2194         /* restore orig length */
 2195         iobuf->length = length;
 2196  out:
 2197 
 2198         return retval;
 2199 }
 2200 
 2201 /*
 2202  * IO completion routine for a buffer_head being used for kiobuf IO: we
 2203  * can't dispatch the kiobuf callback until io_count reaches 0.  
 2204  */
 2205 
 2206 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
 2207 {
 2208         struct kiobuf *kiobuf;
 2209         
 2210         mark_buffer_uptodate(bh, uptodate);
 2211 
 2212         kiobuf = bh->b_private;
 2213         unlock_buffer(bh);
 2214         end_kio_request(kiobuf, uptodate);
 2215 }
 2216 
 2217 /*
 2218  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
 2219  * for them to complete.  Clean up the buffer_heads afterwards.  
 2220  */
 2221 
 2222 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
 2223 {
 2224         int iosize, err;
 2225         int i;
 2226         struct buffer_head *tmp;
 2227 
 2228         iosize = 0;
 2229         err = 0;
 2230 
 2231         for (i = nr; --i >= 0; ) {
 2232                 iosize += size;
 2233                 tmp = bh[i];
 2234                 wait_on_buffer(tmp);
 2235                 
 2236                 if (!buffer_uptodate(tmp)) {
 2237                         /* We are traversing bh'es in reverse order so
 2238                            clearing iosize on error calculates the
 2239                            amount of IO before the first error. */
 2240                         iosize = 0;
 2241                         err = -EIO;
 2242                 }
 2243         }
 2244         
 2245         if (iosize)
 2246                 return iosize;
 2247         return err;
 2248 }
 2249 
 2250 /*
 2251  * Start I/O on a physical range of kernel memory, defined by a vector
 2252  * of kiobuf structs (much like a user-space iovec list).
 2253  *
 2254  * The kiobuf must already be locked for IO.  IO is submitted
 2255  * asynchronously: you need to check page->locked and page->uptodate.
 2256  *
 2257  * It is up to the caller to make sure that there are enough blocks
 2258  * passed in to completely map the iobufs to disk.
 2259  */
 2260 
 2261 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
 2262                kdev_t dev, unsigned long b[], int size)
 2263 {
 2264         int             err;
 2265         int             length;
 2266         int             transferred;
 2267         int             i;
 2268         int             bufind;
 2269         int             pageind;
 2270         int             bhind;
 2271         int             offset;
 2272         unsigned long   blocknr;
 2273         struct kiobuf * iobuf = NULL;
 2274         struct page *   map;
 2275         struct buffer_head *tmp, **bhs = NULL;
 2276 
 2277         if (!nr)
 2278                 return 0;
 2279         
 2280         /* 
 2281          * First, do some alignment and validity checks 
 2282          */
 2283         for (i = 0; i < nr; i++) {
 2284                 iobuf = iovec[i];
 2285                 if ((iobuf->offset & (size-1)) ||
 2286                     (iobuf->length & (size-1)))
 2287                         return -EINVAL;
 2288                 if (!iobuf->nr_pages)
 2289                         panic("brw_kiovec: iobuf not initialised");
 2290         }
 2291 
 2292         /* 
 2293          * OK to walk down the iovec doing page IO on each page we find. 
 2294          */
 2295         bufind = bhind = transferred = err = 0;
 2296         for (i = 0; i < nr; i++) {
 2297                 iobuf = iovec[i];
 2298                 offset = iobuf->offset;
 2299                 length = iobuf->length;
 2300                 iobuf->errno = 0;
 2301                 if (!bhs)
 2302                         bhs = iobuf->bh;
 2303                 
 2304                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
 2305                         map  = iobuf->maplist[pageind];
 2306                         if (!map) {
 2307                                 err = -EFAULT;
 2308                                 goto finished;
 2309                         }
 2310                         
 2311                         while (length > 0) {
 2312                                 blocknr = b[bufind++];
 2313                                 if (blocknr == -1UL) {
 2314                                         if (rw == READ) {
 2315                                                 /* there was a hole in the filesystem */
 2316                                                 memset(kmap(map) + offset, 0, size);
 2317                                                 flush_dcache_page(map);
 2318                                                 kunmap(map);
 2319 
 2320                                                 transferred += size;
 2321                                                 goto skip_block;
 2322                                         } else
 2323                                                 BUG();
 2324                                 }
 2325                                 tmp = bhs[bhind++];
 2326 
 2327                                 tmp->b_size = size;
 2328                                 set_bh_page(tmp, map, offset);
 2329                                 tmp->b_this_page = tmp;
 2330 
 2331                                 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
 2332                                 tmp->b_dev = dev;
 2333                                 tmp->b_blocknr = blocknr;
 2334                                 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
 2335 
 2336                                 if (rw == WRITE) {
 2337                                         set_bit(BH_Uptodate, &tmp->b_state);
 2338                                         clear_bit(BH_Dirty, &tmp->b_state);
 2339                                 } else
 2340                                         set_bit(BH_Uptodate, &tmp->b_state);
 2341 
 2342                                 atomic_inc(&iobuf->io_count);
 2343                                 submit_bh(rw, tmp);
 2344                                 /* 
 2345                                  * Wait for IO if we have got too much 
 2346                                  */
 2347                                 if (bhind >= KIO_MAX_SECTORS) {
 2348                                         kiobuf_wait_for_io(iobuf); /* wake-one */
 2349                                         err = wait_kio(rw, bhind, bhs, size);
 2350                                         if (err >= 0)
 2351                                                 transferred += err;
 2352                                         else
 2353                                                 goto finished;
 2354                                         bhind = 0;
 2355                                 }
 2356 
 2357                         skip_block:
 2358                                 length -= size;
 2359                                 offset += size;
 2360 
 2361                                 if (offset >= PAGE_SIZE) {
 2362                                         offset = 0;
 2363                                         break;
 2364                                 }
 2365                         } /* End of block loop */
 2366                 } /* End of page loop */                
 2367         } /* End of iovec loop */
 2368 
 2369         /* Is there any IO still left to submit? */
 2370         if (bhind) {
 2371                 kiobuf_wait_for_io(iobuf); /* wake-one */
 2372                 err = wait_kio(rw, bhind, bhs, size);
 2373                 if (err >= 0)
 2374                         transferred += err;
 2375                 else
 2376                         goto finished;
 2377         }
 2378 
 2379  finished:
 2380         if (transferred)
 2381                 return transferred;
 2382         return err;
 2383 }
 2384 
 2385 /*
 2386  * Start I/O on a page.
 2387  * This function expects the page to be locked and may return
 2388  * before I/O is complete. You then have to check page->locked
 2389  * and page->uptodate.
 2390  *
 2391  * brw_page() is SMP-safe, although it's being called with the
 2392  * kernel lock held - but the code is ready.
 2393  *
 2394  * FIXME: we need a swapper_inode->get_block function to remove
 2395  *        some of the bmap kludges and interface ugliness here.
 2396  */
 2397 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
 2398 {
 2399         struct buffer_head *head, *bh;
 2400 
 2401         if (!PageLocked(page))
 2402                 panic("brw_page: page not locked for I/O");
 2403 
 2404         if (!page->buffers)
 2405                 create_empty_buffers(page, dev, size);
 2406         head = bh = page->buffers;
 2407 
 2408         /* Stage 1: lock all the buffers */
 2409         do {
 2410                 lock_buffer(bh);
 2411                 bh->b_blocknr = *(b++);
 2412                 set_bit(BH_Mapped, &bh->b_state);
 2413                 set_buffer_async_io(bh);
 2414                 bh = bh->b_this_page;
 2415         } while (bh != head);
 2416 
 2417         /* Stage 2: start the IO */
 2418         do {
 2419                 struct buffer_head *next = bh->b_this_page;
 2420                 submit_bh(rw, bh);
 2421                 bh = next;
 2422         } while (bh != head);
 2423         wakeup_page_waiters(page);
 2424         return 0;
 2425 }
 2426 
 2427 int block_symlink(struct inode *inode, const char *symname, int len)
 2428 {
 2429         struct address_space *mapping = inode->i_mapping;
 2430         struct page *page = grab_cache_page(mapping, 0);
 2431         int err = -ENOMEM;
 2432         char *kaddr;
 2433 
 2434         if (!page)
 2435                 goto fail;
 2436         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
 2437         if (err)
 2438                 goto fail_map;
 2439         kaddr = page_address(page);
 2440         memcpy(kaddr, symname, len-1);
 2441         mapping->a_ops->commit_write(NULL, page, 0, len-1);
 2442         /*
 2443          * Notice that we are _not_ going to block here - end of page is
 2444          * unmapped, so this will only try to map the rest of page, see
 2445          * that it is unmapped (typically it will not even look into the inode -
 2446          * ->i_size will be enough for everything) and zero it out.
 2447          * OTOH it's obviously correct and should make the page up-to-date.
 2448          */
 2449         err = mapping->a_ops->readpage(NULL, page);
 2450         wait_on_page(page);
 2451         page_cache_release(page);
 2452         if (err < 0)
 2453                 goto fail;
 2454         mark_inode_dirty(inode);
 2455         return 0;
 2456 fail_map:
 2457         UnlockPage(page);
 2458         page_cache_release(page);
 2459 fail:
 2460         return err;
 2461 }
 2462 
 2463 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
 2464 {
 2465         struct buffer_head *bh, *tail;
 2466 
 2467         bh = head;
 2468         do {
 2469                 tail = bh;
 2470                 bh = bh->b_this_page;
 2471         } while (bh);
 2472         tail->b_this_page = head;
 2473         page->buffers = head;
 2474         page_cache_get(page);
 2475 }
 2476 
 2477 /*
 2478  * Create the page-cache page that contains the requested block
 2479  */
 2480 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
 2481 {
 2482         struct page * page;
 2483         struct buffer_head *bh;
 2484 
 2485         page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
 2486         if (!page)
 2487                 return NULL;
 2488 
 2489         if (!PageLocked(page))
 2490                 BUG();
 2491 
 2492         bh = page->buffers;
 2493         if (bh) {
 2494                 if (bh->b_size == size)
 2495                         return page;
 2496                 if (!try_to_free_buffers(page, GFP_NOFS))
 2497                         goto failed;
 2498         }
 2499 
 2500         bh = create_buffers(page, size, 0);
 2501         if (!bh)
 2502                 goto failed;
 2503         link_dev_buffers(page, bh);
 2504         return page;
 2505 
 2506 failed:
 2507         UnlockPage(page);
 2508         page_cache_release(page);
 2509         return NULL;
 2510 }
 2511 
 2512 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
 2513 {
 2514         struct buffer_head *head = page->buffers;
 2515         struct buffer_head *bh = head;
 2516         unsigned int uptodate;
 2517 
 2518         uptodate = 1 << BH_Mapped;
 2519         if (Page_Uptodate(page))
 2520                 uptodate |= 1 << BH_Uptodate;
 2521 
 2522         write_lock(&hash_table_lock);
 2523         do {
 2524                 if (!(bh->b_state & (1 << BH_Mapped))) {
 2525                         init_buffer(bh, NULL, NULL);
 2526                         bh->b_dev = dev;
 2527                         bh->b_blocknr = block;
 2528                         bh->b_state = uptodate;
 2529                 }
 2530 
 2531                 /* Insert the buffer into the hash lists if necessary */
 2532                 if (!bh->b_pprev)
 2533                         __insert_into_hash_list(bh);
 2534 
 2535                 block++;
 2536                 bh = bh->b_this_page;
 2537         } while (bh != head);
 2538         write_unlock(&hash_table_lock);
 2539 }
 2540 
 2541 /*
 2542  * Try to increase the number of buffers available: the size argument
 2543  * is used to determine what kind of buffers we want.
 2544  */
 2545 static int grow_buffers(kdev_t dev, unsigned long block, int size)
 2546 {
 2547         struct page * page;
 2548         struct block_device *bdev;
 2549         unsigned long index;
 2550         int sizebits;
 2551 
 2552         /* Size must be multiple of hard sectorsize */
 2553         if (size & (get_hardsect_size(dev)-1))
 2554                 BUG();
 2555         /* Size must be within 512 bytes and PAGE_SIZE */
 2556         if (size < 512 || size > PAGE_SIZE)
 2557                 BUG();
 2558 
 2559         sizebits = -1;
 2560         do {
 2561                 sizebits++;
 2562         } while ((size << sizebits) < PAGE_SIZE);
 2563 
 2564         index = block >> sizebits;
 2565         block = index << sizebits;
 2566 
 2567         bdev = bdget(kdev_t_to_nr(dev));
 2568         if (!bdev) {
 2569                 printk("No block device for %s\n", kdevname(dev));
 2570                 BUG();
 2571         }
 2572 
 2573         /* Create a page with the proper size buffers.. */
 2574         page = grow_dev_page(bdev, index, size);
 2575 
 2576         /* This is "wrong" - talk to Al Viro */
 2577         atomic_dec(&bdev->bd_count);
 2578         if (!page)
 2579                 return 0;
 2580 
 2581         /* Hash in the buffers on the hash list */
 2582         hash_page_buffers(page, dev, block, size);
 2583         UnlockPage(page);
 2584         page_cache_release(page);
 2585 
 2586         /* We hashed up this page, so increment buffermem */
 2587         atomic_inc(&buffermem_pages);
 2588         return 1;
 2589 }
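grow_buffers() locates the page-cache page that holds a given block by computing sizebits, the per-page buffer count expressed as a shift (so that size << sizebits equals PAGE_SIZE), then rounding the block number down to the first block of that page. A sketch of the arithmetic, assuming a 4096-byte PAGE_SIZE:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed for illustration */

int main(void)
{
        unsigned long block = 37;       /* made-up block number */
        int size = 1024;                /* buffer size in bytes */
        int sizebits = -1;
        unsigned long index;

        do {
                sizebits++;
        } while ((size << sizebits) < PAGE_SIZE);

        index = block >> sizebits;      /* page that holds the block */
        block = index << sizebits;      /* first block on that page  */
        printf("sizebits=%d, page index=%lu, first block on page=%lu\n",
               sizebits, index, block);
        return 0;
}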
 2590 
 2591 /*
 2592  * The first time the VM inspects a page which has locked buffers, it
 2593  * will just mark it as needing to be waited upon during the scan of the page LRU.
 2594  * BH_Wait_IO is used for this.
 2595  *
 2596  * The second time the VM visits the page, if it still has locked
 2597  * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
 2598  *
 2599  * The third time the VM visits the page, if the I/O hasn't completed
 2600  * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
 2601  * used for this.
 2602  *
 2603  * There is also the case of buffers which were locked by someone else
 2604  * - write(2) callers, bdflush, etc.  There can be a huge number of these
 2605  * and we don't want to just skip them all and fail the page allocation. 
 2606  * We want to be able to wait on these buffers as well.
 2607  *
 2608  * The BH_Launder bit is set in submit_bh() to indicate that I/O is
 2609  * underway against the buffer, no matter who started it - we know
 2610  * that the buffer will eventually come unlocked, and so it's safe to
 2611  * wait on it.
 2612  *
 2613  * The caller holds the page lock and the caller will free this page
 2614  * into current->local_page, so by waiting on the page's buffers the
 2615  * caller is guaranteed to obtain this page.
 2616  *
 2617  * sync_page_buffers() will sort-of return true if all the buffers
 2618  * against this page are freeable, so try_to_free_buffers() should
 2619  * try to free the page's buffers a second time.  This is a bit
 2620  * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
 2621  */
 2622 static int sync_page_buffers(struct buffer_head *head)
 2623 {
 2624         struct buffer_head * bh = head;
 2625         int tryagain = 1;
 2626 
 2627         do {
 2628                 if (!buffer_dirty(bh) && !buffer_locked(bh))
 2629                         continue;
 2630 
 2631                 /* Don't start IO first time around.. */
 2632                 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
 2633                         tryagain = 0;
 2634                         continue;
 2635                 }
 2636 
 2637                 /* Second time through we start actively writing out.. */
 2638                 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
 2639                         if (unlikely(!buffer_launder(bh))) {
 2640                                 tryagain = 0;
 2641                                 continue;
 2642                         }
 2643                         wait_on_buffer(bh);
 2644                         tryagain = 1;
 2645                         continue;
 2646                 }
 2647 
 2648                 if (!atomic_set_buffer_clean(bh)) {
 2649                         unlock_buffer(bh);
 2650                         continue;
 2651                 }
 2652 
 2653                 __mark_buffer_clean(bh);
 2654                 get_bh(bh);
 2655                 bh->b_end_io = end_buffer_io_sync;
 2656                 submit_bh(WRITE, bh);
 2657                 tryagain = 0;
 2658         } while ((bh = bh->b_this_page) != head);
 2659 
 2660         return tryagain;
 2661 }
 2662 
 2663 /*
 2664  * Can the buffer be thrown out?
 2665  */
 2666 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock))
 2667 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
 2668 
 2669 /*
 2670  * try_to_free_buffers() checks if all the buffers on this particular page
 2671  * are unused, and frees the page if so.
 2672  *
 2673  * Wake up bdflush() if this fails - if we're running low on memory due
 2674  * to dirty buffers, we need to flush them out as quickly as possible.
 2675  *
 2676  * NOTE: There are quite a number of ways that threads of control can
 2677  *       obtain a reference to a buffer head within a page.  So we must
 2678  *       lock out all of these paths to cleanly toss the page.
 2679  */
 2680 int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
 2681 {
 2682         struct buffer_head * tmp, * bh = page->buffers;
 2683 
 2684 cleaned_buffers_try_again:
 2685         spin_lock(&lru_list_lock);
 2686         write_lock(&hash_table_lock);
 2687         tmp = bh;
 2688         do {
 2689                 if (buffer_busy(tmp))
 2690                         goto busy_buffer_page;
 2691                 tmp = tmp->b_this_page;
 2692         } while (tmp != bh);
 2693 
 2694         spin_lock(&unused_list_lock);
 2695         tmp = bh;
 2696 
 2697         /* if this buffer was hashed, this page counts as buffermem */
 2698         if (bh->b_pprev)
 2699                 atomic_dec(&buffermem_pages);
 2700         do {
 2701                 struct buffer_head * p = tmp;
 2702                 tmp = tmp->b_this_page;
 2703 
 2704                 if (p->b_dev == B_FREE) BUG();
 2705 
 2706                 remove_inode_queue(p);
 2707                 __remove_from_queues(p);
 2708                 __put_unused_buffer_head(p);
 2709         } while (tmp != bh);
 2710         spin_unlock(&unused_list_lock);
 2711 
 2712         /* Wake up anyone waiting for buffer heads */
 2713         wake_up(&buffer_wait);
 2714 
 2715         /* And free the page */
 2716         page->buffers = NULL;
 2717         page_cache_release(page);
 2718         write_unlock(&hash_table_lock);
 2719         spin_unlock(&lru_list_lock);
 2720         return 1;
 2721 
 2722 busy_buffer_page:
 2723         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
 2724         write_unlock(&hash_table_lock);
 2725         spin_unlock(&lru_list_lock);
 2726         gfp_mask = pf_gfp_mask(gfp_mask);
 2727         if (gfp_mask & __GFP_IO) {
 2728                 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
 2729                         if (sync_page_buffers(bh)) {
 2730                                 /* no IO or waiting next time */
 2731                                 gfp_mask = 0;
 2732                                 goto cleaned_buffers_try_again;
 2733                         }
 2734                 }
 2735         }
 2736         if (balance_dirty_state() >= 0)
 2737                 wakeup_bdflush();
 2738         return 0;
 2739 }
 2740 EXPORT_SYMBOL(try_to_free_buffers);
 2741 
 2742 /* ================== Debugging =================== */
 2743 
 2744 void show_buffers(void)
 2745 {
 2746 #ifdef CONFIG_SMP
 2747         struct buffer_head * bh;
 2748         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
 2749         int nlist;
 2750         static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
 2751 #endif
 2752 
 2753         printk("Buffer memory:   %6dkB\n",
 2754                         atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
 2755 
 2756         printk("Cache memory:   %6dkB\n",
 2757                         (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
 2758 
 2759 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
 2760         if (!spin_trylock(&lru_list_lock))
 2761                 return;
 2762         for(nlist = 0; nlist < NR_LIST; nlist++) {
 2763                 found = locked = dirty = used = lastused = 0;
 2764                 bh = lru_list[nlist];
 2765                 if(!bh) continue;
 2766 
 2767                 do {
 2768                         found++;
 2769                         if (buffer_locked(bh))
 2770                                 locked++;
 2771                         if (buffer_dirty(bh))
 2772                                 dirty++;
 2773                         if (atomic_read(&bh->b_count))
 2774                                 used++, lastused = found;
 2775                         bh = bh->b_next_free;
 2776                 } while (bh != lru_list[nlist]);
 2777                 {
 2778                         int tmp = nr_buffers_type[nlist];
 2779                         if (found != tmp)
 2780                                 printk("%9s: BUG -> found %d, reported %d\n",
 2781                                        buf_types[nlist], found, tmp);
 2782                 }
 2783                 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
 2784                        "%d locked, %d dirty\n",
 2785                        buf_types[nlist], found, size_buffers_type[nlist]>>10,
 2786                        used, lastused, locked, dirty);
 2787         }
 2788         spin_unlock(&lru_list_lock);
 2789 #endif
 2790 }
 2791 
 2792 /* ===================== Init ======================= */
 2793 
 2794 /*
 2795  * allocate the hash table and init the free list
 2796  * Use gfp() for the hash table to decrease TLB misses, use
 2797  * SLAB cache for buffer heads.
 2798  */
 2799 void __init buffer_init(unsigned long mempages)
 2800 {
 2801         int order, i;
 2802         unsigned int nr_hash;
 2803 
 2804         /* The buffer cache hash table is less important these days,
 2805          * trim it a bit.
 2806          */
 2807         mempages >>= 14;
 2808 
 2809         mempages *= sizeof(struct buffer_head *);
 2810 
 2811         for (order = 0; (1 << order) < mempages; order++)
 2812                 ;
 2813 
 2814         /* try to allocate something until we get it or we're asking
 2815            for something that is really too small */
 2816 
 2817         do {
 2818                 unsigned long tmp;
 2819 
 2820                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
 2821                 bh_hash_mask = (nr_hash - 1);
 2822 
 2823                 tmp = nr_hash;
 2824                 bh_hash_shift = 0;
 2825                 while((tmp >>= 1UL) != 0UL)
 2826                         bh_hash_shift++;
 2827 
 2828                 hash_table = (struct buffer_head **)
 2829                     __get_free_pages(GFP_ATOMIC, order);
 2830         } while (hash_table == NULL && --order > 0);
 2831         printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
 2832                nr_hash, order, (PAGE_SIZE << order));
 2833 
 2834         if (!hash_table)
 2835                 panic("Failed to allocate buffer hash table\n");
 2836 
 2837         /* Setup hash chains. */
 2838         for(i = 0; i < nr_hash; i++)
 2839                 hash_table[i] = NULL;
 2840 
 2841         /* Setup lru lists. */
 2842         for(i = 0; i < NR_LIST; i++)
 2843                 lru_list[i] = NULL;
 2844 
 2845 }
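The sizing above scales the number of hash buckets with memory: mempages is trimmed by 2^14, converted to bytes of bucket pointers, rounded up to a power-of-two allocation order, and nr_hash, bh_hash_mask and bh_hash_shift follow from whatever order was actually allocated. A userspace sketch of that arithmetic (the 128 MB machine and 4096-byte page are assumptions for illustration):

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed for illustration */

int main(void)
{
        /* 128 MB of RAM -> 32768 pages on the made-up example machine. */
        unsigned long mempages = (128UL << 20) / PAGE_SIZE;
        unsigned int nr_hash, bh_hash_mask, bh_hash_shift;
        unsigned long tmp;
        int order;

        mempages >>= 14;                        /* trim: hash is less important */
        mempages *= sizeof(void *);             /* bytes of bucket pointers     */

        for (order = 0; (1UL << order) < mempages; order++)
                ;                               /* smallest sufficient order    */

        nr_hash = (PAGE_SIZE << order) / sizeof(void *);
        bh_hash_mask = nr_hash - 1;

        tmp = nr_hash;
        bh_hash_shift = 0;
        while ((tmp >>= 1UL) != 0UL)
                bh_hash_shift++;

        printf("order=%d nr_hash=%u mask=%#x shift=%u\n",
               order, nr_hash, bh_hash_mask, bh_hash_shift);
        return 0;
}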
 2846 
 2847 
 2848 /* ====================== bdflush support =================== */
 2849 
 2850 /* This is a simple kernel daemon, whose job it is to provide a dynamic
 2851  * response to dirty buffers.  Once this process is activated, we write back
 2852  * a limited number of buffers to the disks and then go back to sleep again.
 2853  */
 2854 
 2855 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
 2856 
 2857 void wakeup_bdflush(void)
 2858 {
 2859         wake_up_interruptible(&bdflush_wait);
 2860 }
 2861 
 2862 /* 
 2863  * Here we attempt to write back old buffers.  We also try to flush inodes 
 2864  * and supers, since this function is essentially "update", and
 2865  * otherwise there would be no way of ensuring that these quantities ever 
 2866  * get written back.  Ideally, we would have a timestamp on the inodes
 2867  * and superblocks so that we could write back only the old ones as well
 2868  */
 2869 
 2870 static int sync_old_buffers(void)
 2871 {
 2872         lock_kernel();
 2873         sync_unlocked_inodes();
 2874         sync_supers(0, 0);
 2875         unlock_kernel();
 2876 
 2877         for (;;) {
 2878                 struct buffer_head *bh;
 2879 
 2880                 spin_lock(&lru_list_lock);
 2881                 bh = lru_list[BUF_DIRTY];
 2882                 if (!bh || time_before(jiffies, bh->b_flushtime))
 2883                         break;
 2884                 if (write_some_buffers(NODEV))
 2885                         continue;
 2886                 return 0;
 2887         }
 2888         spin_unlock(&lru_list_lock);
 2889         return 0;
 2890 }
 2891 
 2892 int block_sync_page(struct page *page)
 2893 {
 2894         run_task_queue(&tq_disk);
 2895         return 0;
 2896 }
 2897 
 2898 /* This is the interface to bdflush.  As we get more sophisticated, we can
 2899  * pass tuning parameters to this "process", to adjust how it behaves. 
 2900  * We would want to verify each parameter, however, to make sure that it 
 2901  * is reasonable. */
 2902 
 2903 asmlinkage long sys_bdflush(int func, long data)
 2904 {
 2905         if (!capable(CAP_SYS_ADMIN))
 2906                 return -EPERM;
 2907 
 2908         if (func == 1) {
 2909                 /* do_exit directly and let kupdate do its work alone. */
 2910                 do_exit(0);
 2911 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
 2912          a syscall that doesn't care about the current mm context. */
 2913                 int error;
 2914                 struct mm_struct *user_mm;
 2915 
 2916                 /*
 2917                  * bdflush will spend all of its time in kernel-space,
 2918                  * without touching user-space, so we can switch it into
 2919                  * 'lazy TLB mode' to reduce the cost of context-switches
 2920                  * to and from bdflush.
 2921                  */
 2922                 user_mm = start_lazy_tlb();
 2923                 error = sync_old_buffers();
 2924                 end_lazy_tlb(user_mm);
 2925                 return error;
 2926 #endif
 2927         }
 2928 
 2929         /* Basically func 2N+2 reads param N and func 2N+3 writes param N, etc */
 2930         if (func >= 2) {
 2931                 int i = (func-2) >> 1;
 2932                 if (i >= 0 && i < N_PARAM) {
 2933                         if ((func & 1) == 0)
 2934                                 return put_user(bdf_prm.data[i], (int*)data);
 2935 
 2936                         if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
 2937                                 bdf_prm.data[i] = data;
 2938                                 return 0;
 2939                         }
 2940                 }
 2941                 return -EINVAL;
 2942         }
 2943 
 2944         /* Func 0 used to launch the actual bdflush daemon and then never
 2945          * return (unless it was explicitly killed). We return zero here to
 2946          * remain semi-compatible with present update(8) programs.
 2947          */
 2948         return 0;
 2949 }
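/*
 * Illustrative userspace sketch, not part of this file: how an update(8)-
 * style tool could drive the func/data encoding above.  It assumes the
 * syscall number is exported as SYS_bdflush via <sys/syscall.h>; which
 * tunable sits at which index is defined by bdf_prm elsewhere in buffer.c,
 * so index 0 below is purely an example.
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        int value = 0;

        /* func 2 reads parameter 0 into the int that data points to. */
        if (syscall(SYS_bdflush, 2, (long)&value) == 0)
                printf("param[0] = %d\n", value);

        /* func 3 writes parameter 0; the kernel range-checks the value. */
        if (syscall(SYS_bdflush, 3, (long)value) != 0)
                perror("bdflush");

        return 0;
}
#endif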
 2950 
 2951 /*
 2952  * This is the actual bdflush daemon itself. It used to be started from
 2953  * the syscall above, but now we launch it ourselves internally with
 2954  * kernel_thread(...) directly after the first thread in init/main.c.
 2955  */
 2956 int bdflush(void *startup)
 2957 {
 2958         struct task_struct *tsk = current;
 2959 
 2960         /*
 2961          *      We have a bare-bones task_struct, and really should fill
 2962          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
 2963          *      display semi-sane things. Not really crucial, though.
 2964          */
 2965 
 2966         tsk->session = 1;
 2967         tsk->pgrp = 1;
 2968         strcpy(tsk->comm, "bdflush");
 2969 
 2970         /* avoid getting signals */
 2971         spin_lock_irq(&tsk->sigmask_lock);
 2972         flush_signals(tsk);
 2973         sigfillset(&tsk->blocked);
 2974         recalc_sigpending(tsk);
 2975         spin_unlock_irq(&tsk->sigmask_lock);
 2976 
 2977         complete((struct completion *)startup);
 2978 
 2979         /*
 2980          * FIXME: The ndirty logic here is wrong.  It's supposed to
 2981          * send bdflush back to sleep after writing ndirty buffers.
 2982          * As written, though, the test is wrong, so bdflush will
 2983          * simply sleep whenever bdflush_stop() returns true.
 2984          *
 2985          * FIXME: If it proves useful to implement ndirty properly,
 2986          * then perhaps the value of ndirty should be scaled by the
 2987          * amount of memory in the machine.
 2988          */
 2989         for (;;) {
 2990                 int ndirty = bdf_prm.b_un.ndirty;
 2991 
 2992                 CHECK_EMERGENCY_SYNC
 2993 
 2994                 while (ndirty > 0) {
 2995                         spin_lock(&lru_list_lock);
 2996                         if (!write_some_buffers(NODEV))
 2997                                 break;
 2998                         ndirty -= NRSYNC;
 2999                 }
 3000                 if (ndirty > 0 || bdflush_stop())
 3001                         interruptible_sleep_on(&bdflush_wait);
 3002         }
 3003 }
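/*
 * Illustrative sketch, not the stock code: one way the loop above could
 * honour ndirty as the first FIXME describes, by counting what was actually
 * written and sleeping once the quota is met or the dirty list drains.
 * Only the loop is sketched (no task setup), and it is untested.
 */
#if 0
int example_bdflush_loop(void)
{
        for (;;) {
                int quota = bdf_prm.b_un.ndirty;
                int written = 0, drained = 0;

                CHECK_EMERGENCY_SYNC

                while (written < quota) {
                        spin_lock(&lru_list_lock);
                        if (!write_some_buffers(NODEV)) {
                                drained = 1;    /* nothing dirty left */
                                break;
                        }
                        written += NRSYNC;
                }
                /* Sleep once the quota was written, the list drained, or
                 * the dirty level has fallen below the stop threshold. */
                if (written >= quota || drained || bdflush_stop())
                        interruptible_sleep_on(&bdflush_wait);
        }
}
#endif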
 3004 
 3005 /*
 3006  * This is the kernel update daemon. It used to live in userspace,
 3007  * but since it needs to run safely we do not want it killed by mistake.
 3008  * You don't need to change your userspace configuration, since
 3009  * the userspace `update` will do_exit(0) at its first sys_bdflush() call.
 3010  */
 3011 int kupdate(void *startup)
 3012 {
 3013         struct task_struct * tsk = current;
 3014         int interval;
 3015 
 3016         tsk->session = 1;
 3017         tsk->pgrp = 1;
 3018         strcpy(tsk->comm, "kupdated");
 3019 
 3020         /* SIGSTOP and SIGCONT will stop and wake up kupdate */
 3021         spin_lock_irq(&tsk->sigmask_lock);
 3022         sigfillset(&tsk->blocked);
 3023         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
 3024         recalc_sigpending(tsk);
 3025         spin_unlock_irq(&tsk->sigmask_lock);
 3026 
 3027         complete((struct completion *)startup);
 3028 
 3029         for (;;) {
 3030                 /* update interval */
 3031                 interval = bdf_prm.b_un.interval;
 3032                 if (interval) {
 3033                         tsk->state = TASK_INTERRUPTIBLE;
 3034                         schedule_timeout(interval);
 3035                 } else {
 3036                 stop_kupdate:
 3037                         tsk->state = TASK_STOPPED;
 3038                         schedule(); /* wait for SIGCONT */
 3039                 }
 3040                 /* check for SIGSTOP */
 3041                 if (signal_pending(tsk)) {
 3042                         int stopped = 0;
 3043                         spin_lock_irq(&tsk->sigmask_lock);
 3044                         if (sigismember(&tsk->pending.signal, SIGSTOP)) {
 3045                                 sigdelset(&tsk->pending.signal, SIGSTOP);
 3046                                 stopped = 1;
 3047                         }
 3048                         recalc_sigpending(tsk);
 3049                         spin_unlock_irq(&tsk->sigmask_lock);
 3050                         if (stopped)
 3051                                 goto stop_kupdate;
 3052                 }
 3053 #ifdef DEBUG
 3054                 printk(KERN_DEBUG "kupdate() activated...\n");
 3055 #endif
 3056                 sync_old_buffers();
 3057                 run_task_queue(&tq_disk);
 3058         }
 3059 }
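/*
 * Illustrative userspace sketch, not part of this file: the signal setup
 * above means an administrator can pause and resume kupdated with SIGSTOP
 * and SIGCONT.  The pid is taken as a parameter here rather than looked up.
 */
#if 0
#include <signal.h>
#include <sys/types.h>

static int pause_kupdated(pid_t pid)
{
        /* kupdate() dequeues the SIGSTOP itself and parks in TASK_STOPPED. */
        return kill(pid, SIGSTOP);
}

static int resume_kupdated(pid_t pid)
{
        /* SIGCONT is left unblocked above, so schedule() returns and the
         * flush loop starts running again. */
        return kill(pid, SIGCONT);
}
#endif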
 3060 
 3061 static int __init bdflush_init(void)
 3062 {
 3063         static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
 3064 
 3065         kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 3066         wait_for_completion(&startup);
 3067         kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 3068         wait_for_completion(&startup);
 3069         return 0;
 3070 }
 3071 
 3072 module_init(bdflush_init)
 3073 
