FreeBSD/Linux Kernel Cross Reference
sys/fs/ext3/inode.c


    1 /*
    2  *  linux/fs/ext3/inode.c
    3  *
    4  * Copyright (C) 1992, 1993, 1994, 1995
    5  * Remy Card (card@masi.ibp.fr)
    6  * Laboratoire MASI - Institut Blaise Pascal
    7  * Universite Pierre et Marie Curie (Paris VI)
    8  *
    9  *  from
   10  *
   11  *  linux/fs/minix/inode.c
   12  *
   13  *  Copyright (C) 1991, 1992  Linus Torvalds
   14  *
   15  *  Goal-directed block allocation by Stephen Tweedie
   16  *      (sct@redhat.com), 1993, 1998
   17  *  Big-endian to little-endian byte-swapping/bitmaps by
   18  *        David S. Miller (davem@caip.rutgers.edu), 1995
   19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
   20  *      (jj@sunsite.ms.mff.cuni.cz)
   21  *
   22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
   23  */
   24 
   25 #include <linux/fs.h>
   26 #include <linux/sched.h>
   27 #include <linux/ext3_jbd.h>
   28 #include <linux/jbd.h>
   29 #include <linux/locks.h>
   30 #include <linux/smp_lock.h>
   31 #include <linux/highuid.h>
   32 #include <linux/quotaops.h>
   33 #include <linux/module.h>
   34 
   35 /*
   36  * SEARCH_FROM_ZERO forces each block allocation to search from the start
   37  * of the filesystem.  This is to force rapid reallocation of recently-freed
   38  * blocks.  The file fragmentation is horrendous.
   39  */
   40 #undef SEARCH_FROM_ZERO
   41 
   42 /* The ext3 forget function must perform a revoke if we are freeing data
   43  * which has been journaled.  Metadata (e.g. indirect blocks) must be
   44  * revoked in all cases. 
   45  *
   46  * "bh" may be NULL: a metadata block may have been freed from memory
   47  * but there may still be a record of it in the journal, and that record
   48  * still needs to be revoked.
   49  */
   50 
   51 static int ext3_forget(handle_t *handle, int is_metadata,
   52                        struct inode *inode, struct buffer_head *bh,
   53                        int blocknr)
   54 {
   55         int err;
   56 
   57         BUFFER_TRACE(bh, "enter");
   58 
   59         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
   60                   "data mode %lx\n",
   61                   bh, is_metadata, inode->i_mode,
   62                   test_opt(inode->i_sb, DATA_FLAGS));
   63         
   64         /* Never use the revoke function if we are doing full data
   65          * journaling: there is no need to, and a V1 superblock won't
   66          * support it.  Otherwise, only skip the revoke on un-journaled
   67          * data blocks. */
   68 
   69         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
   70             (!is_metadata && !ext3_should_journal_data(inode))) {
   71                 if (bh) {
   72                         BUFFER_TRACE(bh, "call journal_forget");
   73                         ext3_journal_forget(handle, bh);
   74                 }
   75                 return 0;
   76         }
   77 
   78         /*
   79          * data!=journal && (is_metadata || should_journal_data(inode))
   80          */
   81         BUFFER_TRACE(bh, "call ext3_journal_revoke");
   82         err = ext3_journal_revoke(handle, blocknr, bh);
   83         if (err)
   84                 ext3_abort(inode->i_sb, __FUNCTION__,
   85                            "error %d when attempting revoke", err);
   86         BUFFER_TRACE(bh, "exit");
   87         return err;
   88 }
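
/*
 * Summarizing the branches above (writing J for a data=journal mount,
 * M for is_metadata and D for ext3_should_journal_data(inode)):
 *
 *      J || (!M && !D)  ->  journal_forget() the buffer; no revoke needed
 *      otherwise        ->  ext3_journal_revoke() the block
 */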
   89 
   90 /*
   91  * Work out how many blocks we need to progress with the next chunk of a
   92  * truncate transaction.
   93  */
   94 
   95 static unsigned long blocks_for_truncate(struct inode *inode) 
   96 {
   97         unsigned long needed;
   98         
   99         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
  100 
  101         /* Give ourselves just enough room to cope with inodes in which
  102          * i_blocks is corrupt: we've seen disk corruptions in the past
  103          * which resulted in random data in an inode which looked enough
  104          * like a regular file for ext3 to try to delete it.  Things
  105          * will go a bit crazy if that happens, but at least we should
  106          * try not to panic the whole kernel. */
  107         if (needed < 2)
  108                 needed = 2;
  109 
  110         /* But we need to bound the transaction so we don't overflow the
  111          * journal. */
  112         if (needed > EXT3_MAX_TRANS_DATA) 
  113                 needed = EXT3_MAX_TRANS_DATA;
  114 
  115         return EXT3_DATA_TRANS_BLOCKS + needed;
  116 }
  117         
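
/*
 * For example, on a filesystem with 4KB blocks (s_blocksize_bits == 12)
 * the shift above is (12 - 9) == 3, converting i_blocks (counted in
 * 512-byte sectors) into filesystem blocks: i_blocks == 80 gives
 * needed == 10.  The result is clamped to [2, EXT3_MAX_TRANS_DATA], and
 * EXT3_DATA_TRANS_BLOCKS is added on top as headroom for the
 * transaction's metadata updates (block bitmaps, group descriptors,
 * the inode itself).
 */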
  118 /* 
  119  * Truncate transactions can be complex and absolutely huge.  So we need to
  120  * be able to restart the transaction at a convenient checkpoint to make
  121  * sure we don't overflow the journal.
  122  *
  123  * start_transaction gets us a new handle for a truncate transaction,
  124  * and extend_transaction tries to extend the existing one a bit.  If
  125  * extend fails, we need to propagate the failure up and restart the
  126  * transaction in the top-level truncate loop. --sct 
  127  */
  128 
  129 static handle_t *start_transaction(struct inode *inode) 
  130 {
  131         handle_t *result;
  132         
  133         result = ext3_journal_start(inode, blocks_for_truncate(inode));
  134         if (!IS_ERR(result))
  135                 return result;
  136         
  137         ext3_std_error(inode->i_sb, PTR_ERR(result));
  138         return result;
  139 }
  140 
  141 /*
  142  * Try to extend this transaction for the purposes of truncation.
  143  *
  144  * Returns 0 if we managed to create more room.  If we can't create more
  145  * room and the transaction must be restarted, we return 1.
  146  */
  147 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  148 {
  149         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
  150                 return 0;
  151         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
  152                 return 0;
  153         return 1;
  154 }
  155 
  156 /*
  157  * Restart the transaction associated with *handle.  This does a commit,
  158  * so before we call here everything must be consistently dirtied against
  159  * this transaction.
  160  */
  161 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
  162 {
  163         jbd_debug(2, "restarting handle %p\n", handle);
  164         return ext3_journal_restart(handle, blocks_for_truncate(inode));
  165 }
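
/*
 * An illustrative sketch (not in the original source) of how the three
 * helpers above combine in the truncate path; more_blocks_to_free() is
 * a hypothetical predicate standing in for the real loop condition:
 */
#if 0
	handle_t *handle = start_transaction(inode);

	while (more_blocks_to_free(inode)) {
		if (try_to_extend_transaction(handle, inode)) {
			/* Out of credits: dirty everything consistently,
			 * then commit and restart the handle. */
			ext3_mark_inode_dirty(handle, inode);
			ext3_journal_test_restart(handle, inode);
		}
		/* ... free the next chunk of blocks here ... */
	}
#endif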
  166 
  167 /*
  168  * Called at each iput()
  169  */
  170 void ext3_put_inode (struct inode * inode)
  171 {
  172         ext3_discard_prealloc (inode);
  173 }
  174 
  175 /*
  176  * Called at the last iput() if i_nlink is zero.
  177  */
  178 void ext3_delete_inode (struct inode * inode)
  179 {
  180         handle_t *handle;
  181         
  182         if (is_bad_inode(inode) ||
  183             inode->i_ino == EXT3_ACL_IDX_INO ||
  184             inode->i_ino == EXT3_ACL_DATA_INO)
  185                 goto no_delete;
  186 
  187         lock_kernel();
  188         handle = start_transaction(inode);
  189         if (IS_ERR(handle)) {
  190                 /* If we're going to skip the normal cleanup, we still
  191                  * need to make sure that the in-core orphan linked list
  192                  * is properly cleaned up. */
  193                 ext3_orphan_del(NULL, inode);
  194 
  195                 ext3_std_error(inode->i_sb, PTR_ERR(handle));
  196                 unlock_kernel();
  197                 goto no_delete;
  198         }
  199         
  200         if (IS_SYNC(inode))
  201                 handle->h_sync = 1;
  202         inode->i_size = 0;
  203         if (inode->i_blocks)
  204                 ext3_truncate(inode);
  205         /*
  206          * Kill off the orphan record which ext3_truncate created.
  207          * AKPM: I think this can be inside the above `if'.
  208          * Note that ext3_orphan_del() has to be able to cope with the
  209          * deletion of a non-existent orphan - this is because we don't
  210          * know if ext3_truncate() actually created an orphan record.
  211          * (Well, we could do this if we need to, but heck - it works)
  212          */
  213         ext3_orphan_del(handle, inode);
  214         inode->u.ext3_i.i_dtime = CURRENT_TIME;
  215 
  216         /* 
  217          * One subtle ordering requirement: if anything has gone wrong
  218          * (transaction abort, IO errors, whatever), then we can still
  219          * do these next steps (the fs will already have been marked as
  220          * having errors), but we can't free the inode if the mark_dirty
  221          * fails.  
  222          */
  223         if (ext3_mark_inode_dirty(handle, inode))
  224                 /* If that failed, just do the required in-core inode clear. */
  225                 clear_inode(inode);
  226         else
  227                 ext3_free_inode(handle, inode);
  228         ext3_journal_stop(handle, inode);
  229         unlock_kernel();
  230         return;
  231 no_delete:
  232         clear_inode(inode);     /* We must guarantee clearing of inode... */
  233 }
  234 
  235 void ext3_discard_prealloc (struct inode * inode)
  236 {
  237 #ifdef EXT3_PREALLOCATE
  238         lock_kernel();
  239         /* Writer: ->i_prealloc* */
  240         if (inode->u.ext3_i.i_prealloc_count) {
  241                 unsigned short total = inode->u.ext3_i.i_prealloc_count;
  242                 unsigned long block = inode->u.ext3_i.i_prealloc_block;
  243                 inode->u.ext3_i.i_prealloc_count = 0;
  244                 inode->u.ext3_i.i_prealloc_block = 0;
  245                 /* Writer: end */
  246                 ext3_free_blocks (inode, block, total);
  247         }
  248         unlock_kernel();
  249 #endif
  250 }
  251 
  252 static int ext3_alloc_block (handle_t *handle,
  253                         struct inode * inode, unsigned long goal, int *err)
  254 {
  255 #ifdef EXT3FS_DEBUG
  256         static unsigned long alloc_hits = 0, alloc_attempts = 0;
  257 #endif
  258         unsigned long result;
  259 
  260 #ifdef EXT3_PREALLOCATE
  261         /* Writer: ->i_prealloc* */
  262         if (inode->u.ext3_i.i_prealloc_count &&
  263             (goal == inode->u.ext3_i.i_prealloc_block ||
  264              goal + 1 == inode->u.ext3_i.i_prealloc_block))
  265         {
  266                 result = inode->u.ext3_i.i_prealloc_block++;
  267                 inode->u.ext3_i.i_prealloc_count--;
  268                 /* Writer: end */
  269                 ext3_debug ("preallocation hit (%lu/%lu).\n",
  270                             ++alloc_hits, ++alloc_attempts);
  271         } else {
  272                 ext3_discard_prealloc (inode);
  273                 ext3_debug ("preallocation miss (%lu/%lu).\n",
  274                             alloc_hits, ++alloc_attempts);
  275                 if (S_ISREG(inode->i_mode))
  276                         result = ext3_new_block (inode, goal, 
  277                                  &inode->u.ext3_i.i_prealloc_count,
  278                                  &inode->u.ext3_i.i_prealloc_block, err);
  279                 else
  280                         result = ext3_new_block (inode, goal, 0, 0, err);
  281                 /*
  282                  * AKPM: this is somewhat sticky.  I'm not surprised it was
  283                  * disabled in 2.2's ext3.  Need to integrate b_committed_data
  284                  * guarding with preallocation, if indeed preallocation is
  285                  * effective.
  286                  */
  287         }
  288 #else
  289         result = ext3_new_block (handle, inode, goal, 0, 0, err);
  290 #endif
  291         return result;
  292 }
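
/*
 * Note that EXT3_PREALLOCATE is not defined, so only the #else branch
 * above is compiled.  The ext3_new_block() calls inside the disabled
 * branch still use the old pre-journaling signature (no handle
 * argument) and would need updating before preallocation could be
 * re-enabled.
 */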
  293 
  294 
  295 typedef struct {
  296         u32     *p;
  297         u32     key;
  298         struct buffer_head *bh;
  299 } Indirect;
  300 
  301 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
  302 {
  303         p->key = *(p->p = v);
  304         p->bh = bh;
  305 }
  306 
  307 static inline int verify_chain(Indirect *from, Indirect *to)
  308 {
  309         while (from <= to && from->key == *from->p)
  310                 from++;
  311         return (from > to);
  312 }
  313 
  314 /**
  315  *      ext3_block_to_path - parse the block number into array of offsets
  316  *      @inode: inode in question (we are only interested in its superblock)
  317  *      @i_block: block number to be parsed
  318  *      @offsets: array to store the offsets in
  319  *
  320  *      To store the locations of file's data ext3 uses a data structure common
  321  *      for UNIX filesystems - tree of pointers anchored in the inode, with
  322  *      data blocks at leaves and indirect blocks in intermediate nodes.
  323  *      This function translates the block number into path in that tree -
  324  *      return value is the path length and @offsets[n] is the offset of
  325  *      pointer to (n+1)th node in the nth one. If @i_block is out of range
  326  *      (negative or too large) warning is printed and zero returned.
  327  *
  328  *      Note: function doesn't find node addresses, so no IO is needed. All
  329  *      we need to know is the capacity of indirect blocks (taken from the
  330  *      inode->i_sb).
  331  */
  332 
  333 /*
  334  * Portability note: the last comparison (check that we fit into triple
  335  * indirect block) is spelled differently, because otherwise on an
  336  * architecture with 32-bit longs and 8Kb pages we might get into trouble
  337  * if our filesystem had 8Kb blocks. We might use long long, but that would
  338  * kill us on x86. Oh, well, at least the sign propagation does not matter -
  339  * i_block would have to be negative in the very beginning, so we would not
  340  * get there at all.
  341  */
  342 
  343 static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
  344 {
  345         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
  346         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
  347         const long direct_blocks = EXT3_NDIR_BLOCKS,
  348                 indirect_blocks = ptrs,
  349                 double_blocks = (1 << (ptrs_bits * 2));
  350         int n = 0;
  351 
  352         if (i_block < 0) {
  353                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
  354         } else if (i_block < direct_blocks) {
  355                 offsets[n++] = i_block;
  356         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
  357                 offsets[n++] = EXT3_IND_BLOCK;
  358                 offsets[n++] = i_block;
  359         } else if ((i_block -= indirect_blocks) < double_blocks) {
  360                 offsets[n++] = EXT3_DIND_BLOCK;
  361                 offsets[n++] = i_block >> ptrs_bits;
  362                 offsets[n++] = i_block & (ptrs - 1);
  363         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
  364                 offsets[n++] = EXT3_TIND_BLOCK;
  365                 offsets[n++] = i_block >> (ptrs_bits * 2);
  366                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
  367                 offsets[n++] = i_block & (ptrs - 1);
  368         } else {
  369                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
  370         }
  371         return n;
  372 }
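
/*
 * For example, on a 1KB-block filesystem (EXT3_NDIR_BLOCKS == 12,
 * ptrs == 256, ptrs_bits == 8, double_blocks == 65536) the mapping
 * above works out as:
 *
 *      i_block == 5      ->  depth 1, offsets = { 5 }
 *      i_block == 12     ->  depth 2, offsets = { EXT3_IND_BLOCK, 0 }
 *      i_block == 267    ->  depth 2, offsets = { EXT3_IND_BLOCK, 255 }
 *      i_block == 268    ->  depth 3, offsets = { EXT3_DIND_BLOCK, 0, 0 }
 *      i_block == 65804  ->  depth 4, offsets = { EXT3_TIND_BLOCK, 0, 0, 0 }
 */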
  373 
  374 /**
  375  *      ext3_get_branch - read the chain of indirect blocks leading to data
  376  *      @inode: inode in question
  377  *      @depth: depth of the chain (1 - direct pointer, etc.)
  378  *      @offsets: offsets of pointers in inode/indirect blocks
  379  *      @chain: place to store the result
  380  *      @err: here we store the error value
  381  *
  382  *      Function fills the array of triples <key, p, bh> and returns %NULL
  383  *      if everything went OK or the pointer to the last filled triple
  384  *      (incomplete one) otherwise. Upon the return chain[i].key contains
  385  *      the number of (i+1)-th block in the chain (as it is stored in memory,
  386  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
  387  *      number (it points into struct inode for i==0 and into the bh->b_data
  388  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
  389  *      block for i>0 and NULL for i==0. In other words, it holds the block
  390  *      numbers of the chain, addresses they were taken from (and where we can
  391  *      verify that chain did not change) and buffer_heads hosting these
  392  *      numbers.
  393  *
  394  *      Function stops when it stumbles upon zero pointer (absent block)
  395  *              (pointer to last triple returned, *@err == 0)
  396  *      or when it gets an IO error reading an indirect block
  397  *              (ditto, *@err == -EIO)
  398  *      or when it notices that chain had been changed while it was reading
  399  *              (ditto, *@err == -EAGAIN)
  400  *      or when it reads all @depth-1 indirect blocks successfully and finds
  401  *      the whole chain, all the way to the data (returns %NULL, *err == 0).
  402  */
  403 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
  404                                  Indirect chain[4], int *err)
  405 {
  406         struct super_block *sb = inode->i_sb;
  407         Indirect *p = chain;
  408         struct buffer_head *bh;
  409 
  410         *err = 0;
  411         /* i_data is not going away, no lock needed */
  412         add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
  413         if (!p->key)
  414                 goto no_block;
  415         while (--depth) {
  416                 bh = sb_bread(sb, le32_to_cpu(p->key));
  417                 if (!bh)
  418                         goto failure;
  419                 /* Reader: pointers */
  420                 if (!verify_chain(chain, p))
  421                         goto changed;
  422                 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
  423                 /* Reader: end */
  424                 if (!p->key)
  425                         goto no_block;
  426         }
  427         return NULL;
  428 
  429 changed:
  430         brelse(bh);
  431         *err = -EAGAIN;
  432         goto no_block;
  433 failure:
  434         *err = -EIO;
  435 no_block:
  436         return p;
  437 }
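
/*
 * For example, a fully resolved chain of depth 3 (doubly-indirect)
 * looks like this:
 *
 *      chain[0].p -> slot in inode->u.ext3_i.i_data,  chain[0].bh == NULL
 *      chain[1].p -> slot in the indirect block,      chain[1].bh == its bh
 *      chain[2].p -> slot in the 2nd-level block,     chain[2].bh == its bh
 *
 * with chain[2].key holding the data block number.  A partial chain
 * ends at the first triple whose key is zero - the hole that
 * ext3_alloc_branch() will later fill.
 */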
  438 
  439 /**
  440  *      ext3_find_near - find a place for allocation with sufficient locality
  441  *      @inode: owner
  442  *      @ind: descriptor of indirect block.
  443  *
  444  *      This function returns the preferred place for block allocation.
  445  *      It is used when heuristic for sequential allocation fails.
  446  *      Rules are:
  447  *        + if there is a block to the left of our position - allocate near it.
  448  *        + if pointer will live in indirect block - allocate near that block.
  449  *        + if pointer will live in inode - allocate in the same
  450  *          cylinder group. 
  451  *      Caller must make sure that @ind is valid and will stay that way.
  452  */
  453 
  454 static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
  455 {
  456         u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
  457         u32 *p;
  458 
  459         /* Try to find previous block */
  460         for (p = ind->p - 1; p >= start; p--)
  461                 if (*p)
  462                         return le32_to_cpu(*p);
  463 
  464         /* No such thing, so let's try location of indirect block */
  465         if (ind->bh)
  466                 return ind->bh->b_blocknr;
  467 
  468         /*
  469          * Is it going to be referred to from the inode itself? OK, just
  470          * put it into the same cylinder group then.
  471          */
  472         return (inode->u.ext3_i.i_block_group * 
  473                 EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
  474                le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
  475 }
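
/*
 * In other words: when the pointer being filled lives in an indirect
 * block, the loop above prefers the closest non-zero pointer to its
 * "left" in that block, then the indirect block's own location; only
 * for pointers in the inode itself do we fall back to the start of the
 * inode's cylinder group.
 */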
  476 
  477 /**
  478  *      ext3_find_goal - find a preferred place for allocation.
  479  *      @inode: owner
  480  *      @block:  block we want
  481  *      @chain:  chain of indirect blocks
  482  *      @partial: pointer to the last triple within a chain
  483  *      @goal:  place to store the result.
  484  *
  485  *      Normally this function finds the preferred place for block allocation,
  486  *      stores it in *@goal and returns zero. If the branch had been changed
  487  *      under us we return -EAGAIN.
  488  */
  489 
  490 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
  491                           Indirect *partial, unsigned long *goal)
  492 {
  493         /* Writer: ->i_next_alloc* */
  494         if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
  495                 inode->u.ext3_i.i_next_alloc_block++;
  496                 inode->u.ext3_i.i_next_alloc_goal++;
  497         }
  498 #ifdef SEARCH_FROM_ZERO
  499         inode->u.ext3_i.i_next_alloc_block = 0;
  500         inode->u.ext3_i.i_next_alloc_goal = 0;
  501 #endif
  502         /* Writer: end */
  503         /* Reader: pointers, ->i_next_alloc* */
  504         if (verify_chain(chain, partial)) {
  505                 /*
  506                  * try the heuristic for sequential allocation,
  507                  * failing that at least try to get decent locality.
  508                  */
  509                 if (block == inode->u.ext3_i.i_next_alloc_block)
  510                         *goal = inode->u.ext3_i.i_next_alloc_goal;
  511                 if (!*goal)
  512                         *goal = ext3_find_near(inode, partial);
  513 #ifdef SEARCH_FROM_ZERO
  514                 *goal = 0;
  515 #endif
  516                 return 0;
  517         }
  518         /* Reader: end */
  519         return -EAGAIN;
  520 }
  521 
  522 /**
  523  *      ext3_alloc_branch - allocate and set up a chain of blocks.
  524  *      @inode: owner
  525  *      @num: depth of the chain (number of blocks to allocate)
  526  *      @offsets: offsets (in the blocks) to store the pointers to next.
  527  *      @branch: place to store the chain in.
  528  *
  529  *      This function allocates @num blocks, zeroes out all but the last one,
  530  *      links them into chain and (if we are synchronous) writes them to disk.
  531  *      In other words, it prepares a branch that can be spliced onto the
  532  *      inode. It stores the information about that chain in the branch[], in
  533  *      the same format as ext3_get_branch() would do. We are calling it after
  534  *      we have read the existing part of the chain and partial points to
  535  *      the last triple of that (one with zero ->key). Upon exit we have the
  536  *      same picture as after a successful ext3_get_block(), except that in one
  537  *      place chain is disconnected - *branch->p is still zero (we did not
  538  *      set the last link), but branch->key contains the number that should
  539  *      be placed into *branch->p to fill that gap.
  540  *
  541  *      If allocation fails we free all blocks we've allocated (and forget
  542  *      their buffer_heads) and return the error value from the failed
  543  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
  544  *      as described above and return 0.
  545  */
  546 
  547 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
  548                              int num,
  549                              unsigned long goal,
  550                              int *offsets,
  551                              Indirect *branch)
  552 {
  553         int blocksize = inode->i_sb->s_blocksize;
  554         int n = 0, keys = 0;
  555         int err = 0;
  556         int i;
  557         int parent = ext3_alloc_block(handle, inode, goal, &err);
  558 
  559         branch[0].key = cpu_to_le32(parent);
  560         if (parent) {
  561                 for (n = 1; n < num; n++) {
  562                         struct buffer_head *bh;
  563                         /* Allocate the next block */
  564                         int nr = ext3_alloc_block(handle, inode, parent, &err);
  565                         if (!nr)
  566                                 break;
  567                         branch[n].key = cpu_to_le32(nr);
  568                         keys = n+1;
  569                         
  570                         /*
  571                          * Get buffer_head for parent block, zero it out
  572                          * and set the pointer to new one, then send
  573                          * parent to disk.  
  574                          */
  575                         bh = sb_getblk(inode->i_sb, parent);
  576                         branch[n].bh = bh;
  577                         lock_buffer(bh);
  578                         BUFFER_TRACE(bh, "call get_create_access");
  579                         err = ext3_journal_get_create_access(handle, bh);
  580                         if (err) {
  581                                 unlock_buffer(bh);
  582                                 brelse(bh);
  583                                 break;
  584                         }
  585 
  586                         memset(bh->b_data, 0, blocksize);
  587                         branch[n].p = (u32*) bh->b_data + offsets[n];
  588                         *branch[n].p = branch[n].key;
  589                         BUFFER_TRACE(bh, "marking uptodate");
  590                         mark_buffer_uptodate(bh, 1);
  591                         unlock_buffer(bh);
  592 
  593                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
  594                         err = ext3_journal_dirty_metadata(handle, bh);
  595                         if (err)
  596                                 break;
  597                         
  598                         parent = nr;
  599                 }
  600         }
  601         if (n == num)
  602                 return 0;
  603 
  604         /* Allocation failed, free what we already allocated */
  605         for (i = 1; i < keys; i++) {
  606                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
  607                 ext3_journal_forget(handle, branch[i].bh);
  608         }
  609         for (i = 0; i < keys; i++)
  610                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
  611         return err;
  612 }
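
/*
 * On success with num == 3, for example, branch[] describes three
 * freshly allocated blocks wired together top-down: branch[1].key is
 * already stored at offsets[1] inside the block branch[0].key refers
 * to, and branch[2].key (the data block) inside branch[1]'s block.
 * Only the topmost link, *branch[0].p, is left unset - splicing it in
 * is ext3_splice_branch()'s job.
 */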
  613 
  614 /**
  615  *      ext3_splice_branch - splice the allocated branch onto inode.
  616  *      @inode: owner
  617  *      @block: (logical) number of block we are adding
  618  *      @chain: chain of indirect blocks (with a missing link - see
  619  *              ext3_alloc_branch)
  620  *      @where: location of missing link
  621  *      @num:   number of blocks we are adding
  622  *
  623  *      This function verifies that the chain (up to the missing link) had not
  624  *      changed, fills the missing link and does all housekeeping needed in
  625  *      inode (->i_blocks, etc.). In case of success we end up with the full
  626  *      chain to new block and return 0. Otherwise (== chain had been changed)
  627  *      we free the new blocks (forgetting their buffer_heads, indeed) and
  628  *      return -EAGAIN.
  629  */
  630 
  631 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
  632                               Indirect chain[4], Indirect *where, int num)
  633 {
  634         int i;
  635         int err = 0;
  636 
  637         /*
  638          * If we're splicing into a [td]indirect block (as opposed to the
  639          * inode) then we need to get write access to the [td]indirect block
  640          * before the splice.
  641          */
  642         if (where->bh) {
  643                 BUFFER_TRACE(where->bh, "get_write_access");
  644                 err = ext3_journal_get_write_access(handle, where->bh);
  645                 if (err)
  646                         goto err_out;
  647         }
  648         /* Verify that place we are splicing to is still there and vacant */
  649 
  650         /* Writer: pointers, ->i_next_alloc* */
  651         if (!verify_chain(chain, where-1) || *where->p)
  652                 /* Writer: end */
  653                 goto changed;
  654 
  655         /* That's it */
  656 
  657         *where->p = where->key;
  658         inode->u.ext3_i.i_next_alloc_block = block;
  659         inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
  660 #ifdef SEARCH_FROM_ZERO
  661         inode->u.ext3_i.i_next_alloc_block = 0;
  662         inode->u.ext3_i.i_next_alloc_goal = 0;
  663 #endif
  664         /* Writer: end */
  665 
  666         /* We are done with atomic stuff, now do the rest of housekeeping */
  667 
  668         inode->i_ctime = CURRENT_TIME;
  669         ext3_mark_inode_dirty(handle, inode);
  670 
  671         /* had we spliced it onto an indirect block? */
  672         if (where->bh) {
  673                 /*
  674                  * akpm: If we spliced it onto an indirect block, we haven't
  675                  * altered the inode.  Note however that if it is being spliced
  676                  * onto an indirect block at the very end of the file (the
  677                  * file is growing) then we *will* alter the inode to reflect
  678                  * the new i_size.  But that is not done here - it is done in
  679                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
  680                  */
  681                 jbd_debug(5, "splicing indirect only\n");
  682                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
  683                 err = ext3_journal_dirty_metadata(handle, where->bh);
  684                 if (err) 
  685                         goto err_out;
  686         } else {
  687                 /*
  688                  * OK, we spliced it into the inode itself on a direct block.
  689                  * Inode was dirtied above.
  690                  */
  691                 jbd_debug(5, "splicing direct\n");
  692         }
  693         return err;
  694 
  695 changed:
  696         /*
  697          * AKPM: if where[i].bh isn't part of the current updating
  698          * transaction then we explode nastily.  Test this code path.
  699          */
  700         jbd_debug(1, "the chain changed: try again\n");
  701         err = -EAGAIN;
  702         
  703 err_out:
  704         for (i = 1; i < num; i++) {
  705                 BUFFER_TRACE(where[i].bh, "call journal_forget");
  706                 ext3_journal_forget(handle, where[i].bh);
  707         }
  708         /* For the normal collision cleanup case, we free up the blocks.
  709          * On genuine filesystem errors we don't even think about doing
  710          * that. */
  711         if (err == -EAGAIN)
  712                 for (i = 0; i < num; i++)
  713                         ext3_free_blocks(handle, inode, 
  714                                          le32_to_cpu(where[i].key), 1);
  715         return err;
  716 }
  717 
  718 /*
  719  * Allocation strategy is simple: if we have to allocate something, we will
  720  * have to go the whole way to leaf. So let's do it before attaching anything
  721  * to tree, set linkage between the newborn blocks, write them if sync is
  722  * required, recheck the path, free and repeat if check fails, otherwise
  723  * set the last missing link (that will protect us from any truncate-generated
  724  * removals - all blocks on the path are immune now) and possibly force the
  725  * write on the parent block.
  726  * That has a nice additional property: no special recovery from the failed
  727  * allocations is needed - we simply release blocks and do not touch anything
  728  * reachable from inode.
  729  *
  730  * akpm: `handle' can be NULL if create == 0.
  731  *
  732  * The BKL may not be held on entry here.  Be sure to take it early.
  733  */
  734 
  735 static int ext3_get_block_handle(handle_t *handle, struct inode *inode, 
  736                                  long iblock,
  737                                  struct buffer_head *bh_result, int create)
  738 {
  739         int err = -EIO;
  740         int offsets[4];
  741         Indirect chain[4];
  742         Indirect *partial;
  743         unsigned long goal;
  744         int left;
  745         int depth = ext3_block_to_path(inode, iblock, offsets);
  746         loff_t new_size;
  747 
  748         J_ASSERT(handle != NULL || create == 0);
  749 
  750         if (depth == 0)
  751                 goto out;
  752 
  753         lock_kernel();
  754 reread:
  755         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
  756 
  757         /* Simplest case - block found, no allocation needed */
  758         if (!partial) {
  759                 bh_result->b_state &= ~(1UL << BH_New);
  760 got_it:
  761                 bh_result->b_dev = inode->i_dev;
  762                 bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
  763                 bh_result->b_state |= (1UL << BH_Mapped);
  764                 /* Clean up and exit */
  765                 partial = chain+depth-1; /* the whole chain */
  766                 goto cleanup;
  767         }
  768 
  769         /* Next simple case - plain lookup or failed read of indirect block */
  770         if (!create || err == -EIO) {
  771 cleanup:
  772                 while (partial > chain) {
  773                         BUFFER_TRACE(partial->bh, "call brelse");
  774                         brelse(partial->bh);
  775                         partial--;
  776                 }
  777                 BUFFER_TRACE(bh_result, "returned");
  778                 unlock_kernel();
  779 out:
  780                 return err;
  781         }
  782 
  783         /*
  784          * Indirect block might be removed by truncate while we were
  785          * reading it. Handling of that case (forget what we've got and
  786          * reread) is taken out of the main path.
  787          */
  788         if (err == -EAGAIN)
  789                 goto changed;
  790 
  791         if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
  792                 goto changed;
  793 
  794         left = (chain + depth) - partial;
  795 
  796         /*
  797          * Block out ext3_truncate while we alter the tree
  798          */
  799         down_read(&inode->u.ext3_i.truncate_sem);
  800         err = ext3_alloc_branch(handle, inode, left, goal,
  801                                         offsets+(partial-chain), partial);
  802 
  803         /* The ext3_splice_branch call will free and forget any buffers
  804          * on the new chain if there is a failure, but that risks using
  805          * up transaction credits, especially for bitmaps where the
  806          * credits cannot be returned.  Can we handle this somehow?  We
  807          * may need to return -EAGAIN upwards in the worst case.  --sct */
  808         if (!err)
  809                 err = ext3_splice_branch(handle, inode, iblock, chain,
  810                                          partial, left);
  811         up_read(&inode->u.ext3_i.truncate_sem);
  812         if (err == -EAGAIN)
  813                 goto changed;
  814         if (err)
  815                 goto cleanup;
  816 
  817         new_size = inode->i_size;
  818         /*
  819          * This is not racy against ext3_truncate's modification of i_disksize
  820          * because VM/VFS ensures that the file cannot be extended while
  821          * truncate is in progress.  It is racy between multiple parallel
  822          * instances of get_block, but we have the BKL.
  823          */
  824         if (new_size > inode->u.ext3_i.i_disksize)
  825                 inode->u.ext3_i.i_disksize = new_size;
  826 
  827         bh_result->b_state |= (1UL << BH_New);
  828         goto got_it;
  829 
  830 changed:
  831         while (partial > chain) {
  832                 jbd_debug(1, "buffer chain changed, retrying\n");
  833                 BUFFER_TRACE(partial->bh, "brelsing");
  834                 brelse(partial->bh);
  835                 partial--;
  836         }
  837         goto reread;
  838 }
  839 
  840 /*
  841  * The BKL is not held on entry here.
  842  */
  843 static int ext3_get_block(struct inode *inode, long iblock,
  844                         struct buffer_head *bh_result, int create)
  845 {
  846         handle_t *handle = 0;
  847         int ret;
  848 
  849         if (create) {
  850                 handle = ext3_journal_current_handle();
  851                 J_ASSERT(handle != 0);
  852         }
  853         ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
  854         return ret;
  855 }
  856 
  857 /*
  858  * `handle' can be NULL if create is zero
  859  */
  860 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
  861                                 long block, int create, int * errp)
  862 {
  863         struct buffer_head dummy;
  864         int fatal = 0, err;
  865         
  866         J_ASSERT(handle != NULL || create == 0);
  867 
  868         dummy.b_state = 0;
  869         dummy.b_blocknr = -1000;
  870         buffer_trace_init(&dummy.b_history);
  871         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
  872         if (!*errp && buffer_mapped(&dummy)) {
  873                 struct buffer_head *bh;
  874                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
  875                 if (buffer_new(&dummy)) {
  876                         J_ASSERT(create != 0);
  877                         J_ASSERT(handle != 0);
  878 
  879                         /* Now that we do not always journal data, we
  880                            should keep in mind whether this should
  881                            always journal the new buffer as metadata.
  882                            For now, regular file writes use
  883                            ext3_get_block instead, so it's not a
  884                            problem. */
  885                         lock_kernel();
  886                         lock_buffer(bh);
  887                         BUFFER_TRACE(bh, "call get_create_access");
  888                         fatal = ext3_journal_get_create_access(handle, bh);
  889                         if (!fatal) {
  890                                 memset(bh->b_data, 0,
  891                                        inode->i_sb->s_blocksize);
  892                                 mark_buffer_uptodate(bh, 1);
  893                         }
  894                         unlock_buffer(bh);
  895                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
  896                         err = ext3_journal_dirty_metadata(handle, bh);
  897                         if (!fatal) fatal = err;
  898                         unlock_kernel();
  899                 } else {
  900                         BUFFER_TRACE(bh, "not a new buffer");
  901                 }
  902                 if (fatal) {
  903                         *errp = fatal;
  904                         brelse(bh);
  905                         bh = NULL;
  906                 }
  907                 return bh;
  908         }
  909         return NULL;
  910 }
  911 
  912 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
  913                                int block, int create, int *err)
  914 {
  915         struct buffer_head * bh;
  916         int prev_blocks;
  917 
  918         prev_blocks = inode->i_blocks;
  919 
  920         bh = ext3_getblk (handle, inode, block, create, err);
  921         if (!bh)
  922                 return bh;
  923 #ifdef EXT3_PREALLOCATE
  924         /*
  925          * If the inode has grown, and this is a directory, then use a few
  926          * more of the preallocated blocks to keep directory fragmentation
  927          * down.  The preallocated blocks are guaranteed to be contiguous.
  928          */
  929         if (create &&
  930             S_ISDIR(inode->i_mode) &&
  931             inode->i_blocks > prev_blocks &&
  932             EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
  933                                     EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
  934                 int i;
  935                 struct buffer_head *tmp_bh;
  936 
  937                 for (i = 1;
  938                      inode->u.ext3_i.i_prealloc_count &&
  939                      i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
  940                      i++) {
  941                         /*
  942                          * ext3_getblk will zero out the contents of the
  943                          * directory for us
  944                          */
  945                         tmp_bh = ext3_getblk(handle, inode,
  946                                                 block+i, create, err);
  947                         if (!tmp_bh) {
  948                                 brelse (bh);
  949                                 return 0;
  950                         }
  951                         brelse (tmp_bh);
  952                 }
  953         }
  954 #endif
  955         if (buffer_uptodate(bh))
  956                 return bh;
  957         ll_rw_block (READ, 1, &bh);
  958         wait_on_buffer (bh);
  959         if (buffer_uptodate(bh))
  960                 return bh;
  961         brelse (bh);
  962         *err = -EIO;
  963         return NULL;
  964 }
  965 
  966 static int walk_page_buffers(   handle_t *handle,
  967                                 struct inode *inode,
  968                                 struct buffer_head *head,
  969                                 unsigned from,
  970                                 unsigned to,
  971                                 int *partial,
  972                                 int (*fn)(      handle_t *handle,
  973                                                 struct inode *inode,
  974                                                 struct buffer_head *bh))
  975 {
  976         struct buffer_head *bh;
  977         unsigned block_start, block_end;
  978         unsigned blocksize = head->b_size;
  979         int err, ret = 0;
  980 
  981         for (   bh = head, block_start = 0;
  982                 ret == 0 && (bh != head || !block_start);
  983                 block_start = block_end, bh = bh->b_this_page)
  984         {
  985                 block_end = block_start + blocksize;
  986                 if (block_end <= from || block_start >= to) {
  987                         if (partial && !buffer_uptodate(bh))
  988                                 *partial = 1;
  989                         continue;
  990                 }
  991                 err = (*fn)(handle, inode, bh);
  992                 if (!ret)
  993                         ret = err;
  994         }
  995         return ret;
  996 }
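
/*
 * For example, ext3_commit_write() below uses this walker in
 * data=journal mode to run commit_write_fn() over every buffer in the
 * written range, with *partial reporting whether any buffer outside
 * [from, to) is not uptodate (so the caller knows whether
 * SetPageUptodate() is safe):
 */
#if 0
	int partial = 0;
	ret = walk_page_buffers(handle, inode, page->buffers,
				from, to, &partial, commit_write_fn);
	if (!partial)
		SetPageUptodate(page);
#endif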
  997 
  998 /*
  999  * To preserve ordering, it is essential that the hole instantiation and
 1000  * the data write be encapsulated in a single transaction.  We cannot
 1001  * close off a transaction and start a new one between the ext3_get_block()
 1002  * and the commit_write().  So doing the journal_start at the start of
 1003  * prepare_write() is the right place.
 1004  *
 1005  * Also, this function can nest inside ext3_writepage() ->
 1006  * block_write_full_page(). In that case, we *know* that ext3_writepage()
 1007  * has generated enough buffer credits to do the whole page.  So we won't
 1008  * block on the journal in that case, which is good, because the caller may
 1009  * be PF_MEMALLOC.
 1010  *
 1011  * By accident, ext3 can be reentered when a transaction is open via
 1012  * quota file writes.  If we were to commit the transaction while thus
 1013  * reentered, there can be a deadlock - we would be holding a quota
 1014  * lock, and the commit would never complete if another thread had a
 1015  * transaction open and was blocking on the quota lock - a ranking
 1016  * violation.
 1017  *
 1018  * So what we do is to rely on the fact that journal_stop/journal_start
 1019  * will _not_ run commit under these circumstances because handle->h_ref
 1020  * is elevated.  We'll still have enough credits for the tiny quotafile
 1021  * write.  
 1022  */
 1023 
 1024 static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 1025                                        struct buffer_head *bh)
 1026 {
 1027         return ext3_journal_get_write_access(handle, bh);
 1028 }
 1029 
 1030 static int ext3_prepare_write(struct file *file, struct page *page,
 1031                               unsigned from, unsigned to)
 1032 {
 1033         struct inode *inode = page->mapping->host;
 1034         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
 1035         handle_t *handle;
 1036 
 1037         lock_kernel();
 1038         handle = ext3_journal_start(inode, needed_blocks);
 1039         if (IS_ERR(handle)) {
 1040                 ret = PTR_ERR(handle);
 1041                 goto out;
 1042         }
 1043         unlock_kernel();
 1044         ret = block_prepare_write(page, from, to, ext3_get_block);
 1045         lock_kernel();
 1046         if (ret != 0)
 1047                 goto prepare_write_failed;
 1048 
 1049         if (ext3_should_journal_data(inode)) {
 1050                 ret = walk_page_buffers(handle, inode, page->buffers,
 1051                                 from, to, NULL, do_journal_get_write_access);
 1052                 if (ret) {
 1053                         /*
 1054                          * We're going to fail this prepare_write(),
 1055                          * so commit_write() will not be called.
 1056                          * We need to undo block_prepare_write()'s kmap().
 1057                          * AKPM: Do we need to clear PageUptodate?  I don't
 1058                          * think so.
 1059                          */
 1060                         kunmap(page);
 1061                 }
 1062         }
 1063 prepare_write_failed:
 1064         if (ret)
 1065                 ext3_journal_stop(handle, inode);
 1066 out:
 1067         unlock_kernel();
 1068         return ret;
 1069 }
 1070 
 1071 static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
 1072                                    struct buffer_head *bh)
 1073 {
 1074         int ret = ext3_journal_dirty_data(handle, bh, 0);
 1075         buffer_insert_inode_data_queue(bh, inode);
 1076         return ret;
 1077 }
 1078 
 1079 /*
 1080  * For ext3_writepage().  We also brelse() the buffer to account for
 1081  * the bget() which ext3_writepage() performs.
 1082  */
 1083 static int journal_dirty_async_data(handle_t *handle, struct inode *inode, 
 1084                                     struct buffer_head *bh)
 1085 {
 1086         int ret = ext3_journal_dirty_data(handle, bh, 1);
 1087         buffer_insert_inode_data_queue(bh, inode);
 1088         __brelse(bh);
 1089         return ret;
 1090 }
 1091 
 1092 /* For commit_write() in data=journal mode */
 1093 static int commit_write_fn(handle_t *handle, struct inode *inode, 
 1094                            struct buffer_head *bh)
 1095 {
 1096         set_bit(BH_Uptodate, &bh->b_state);
 1097         return ext3_journal_dirty_metadata(handle, bh);
 1098 }
 1099 
 1100 /*
 1101  * We need to pick up the new inode size which generic_commit_write gave us.
 1102  * `file' can be NULL - eg, when called from block_symlink().
 1103  *
 1104  * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
 1105  * definitely don't want them to appear on the inode at all - instead
 1106  * we need to manage them at the JBD layer and we need to intercept
 1107  * the relevant sync operations and translate them into journal operations.
 1108  *
 1109  * If we're not journalling data then we can just leave the buffers
 1110  * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
 1111  * Otherwise we'll do it in commit, if we're using ordered data.
 1112  */
 1113 
 1114 static int ext3_commit_write(struct file *file, struct page *page,
 1115                              unsigned from, unsigned to)
 1116 {
 1117         handle_t *handle = ext3_journal_current_handle();
 1118         struct inode *inode = page->mapping->host;
 1119         int ret = 0, ret2;
 1120 
 1121         lock_kernel();
 1122         if (ext3_should_journal_data(inode)) {
 1123                 /*
 1124                  * Here we duplicate the generic_commit_write() functionality
 1125                  */
 1126                 int partial = 0;
 1127                 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 1128 
 1129                 ret = walk_page_buffers(handle, inode, page->buffers,
 1130                         from, to, &partial, commit_write_fn);
 1131                 if (!partial)
 1132                         SetPageUptodate(page);
 1133                 kunmap(page);
 1134                 if (pos > inode->i_size)
 1135                         inode->i_size = pos;
 1136                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 1137         } else {
 1138                 if (ext3_should_order_data(inode)) {
 1139                         ret = walk_page_buffers(handle, inode, page->buffers,
 1140                                 from, to, NULL, journal_dirty_sync_data);
 1141                 }
 1142                 /* Be careful here if generic_commit_write becomes a
 1143                  * required invocation after block_prepare_write. */
 1144                 if (ret == 0) {
 1145                         ret = generic_commit_write(file, page, from, to);
 1146                 } else {
 1147                         /*
 1148                          * block_prepare_write() was called, but we're not
 1149                          * going to call generic_commit_write().  So we
 1150                          * need to perform generic_commit_write()'s kunmap
 1151                          * by hand.
 1152                          */
 1153                         kunmap(page);
 1154                 }
 1155         }
 1156         if (inode->i_size > inode->u.ext3_i.i_disksize) {
 1157                 inode->u.ext3_i.i_disksize = inode->i_size;
 1158                 ret2 = ext3_mark_inode_dirty(handle, inode);
 1159                 if (!ret) 
 1160                         ret = ret2;
 1161         }
 1162         ret2 = ext3_journal_stop(handle, inode);
 1163         unlock_kernel();
 1164         if (!ret)
 1165                 ret = ret2;
 1166         return ret;
 1167 }
 1168 
 1169 /* 
 1170  * bmap() is special.  It gets used by applications such as lilo and by
 1171  * the swapper to find the on-disk block of a specific piece of data.
 1172  *
 1173  * Naturally, this is dangerous if the block concerned is still in the
 1174  * journal.  If somebody makes a swapfile on an ext3 data-journaling
 1175  * filesystem and enables swap, then they may get a nasty shock when the
 1176  * data getting swapped to that swapfile suddenly gets overwritten by
 1177  * the original zeros written out previously to the journal and
 1178  * awaiting writeback in the kernel's buffer cache. 
 1179  *
 1180  * So, if we see any bmap calls here on a modified, data-journaled file,
 1181  * take extra steps to flush any blocks which might be in the cache. 
 1182  */
 1183 static int ext3_bmap(struct address_space *mapping, long block)
 1184 {
 1185         struct inode *inode = mapping->host;
 1186         journal_t *journal;
 1187         int err;
 1188         
 1189         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
 1190                 /* 
 1191                  * This is a REALLY heavyweight approach, but the use of
 1192                  * bmap on dirty files is expected to be extremely rare:
 1193                  * only if we run lilo or swapon on a freshly made file
 1194                  * do we expect this to happen. 
 1195                  *
 1196                  * (bmap requires CAP_SYS_RAWIO so this does not
 1197                  * represent an unprivileged user DOS attack --- we'd be
 1198                  * in trouble if mortal users could trigger this path at
 1199                  * will.) 
 1200                  *
 1201                  * NB. EXT3_STATE_JDATA is not set on files other than
 1202                  * regular files.  If somebody wants to bmap a directory
 1203                  * or symlink and gets confused because the buffer
 1204                  * hasn't yet been flushed to disk, they deserve
 1205                  * everything they get.
 1206                  */
 1207                 
 1208                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
 1209                 journal = EXT3_JOURNAL(inode);
 1210                 journal_lock_updates(journal);
 1211                 err = journal_flush(journal);
 1212                 journal_unlock_updates(journal);
 1213                 
 1214                 if (err)
 1215                         return 0;
 1216         }
 1217         
 1218         return generic_block_bmap(mapping,block,ext3_get_block);
 1219 }
 1220 
 1221 static int bget_one(handle_t *handle, struct inode *inode, 
 1222                     struct buffer_head *bh)
 1223 {
 1224         atomic_inc(&bh->b_count);
 1225         return 0;
 1226 }
 1227 
 1228 /*
 1229  * Note that we always start a transaction even if we're not journalling
 1230  * data.  This is to preserve ordering: any hole instantiation within
 1231  * __block_write_full_page -> ext3_get_block() should be journalled
 1232  * along with the data so we don't crash and then get metadata which
 1233  * refers to old data.
 1234  *
 1235  * In all journalling modes block_write_full_page() will start the I/O.
 1236  *
 1237  * Problem:
 1238  *
 1239  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 1240  *              ext3_writepage()
 1241  *
 1242  * Similar for:
 1243  *
 1244  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 1245  *
 1246  * Same applies to ext3_get_block().  We will deadlock on various things like
 1247  * lock_journal and i_truncate_sem.
 1248  *
 1249  * Setting PF_MEMALLOC here doesn't work - too many internal memory
 1250  * allocations fail.
 1251  *
 1252  * 16May01: If we're reentered then journal_current_handle() will be
 1253  *          non-zero. We simply *return*.
 1254  *
 1255  * 1 July 2001: @@@ FIXME:
 1256  *   In journalled data mode, a data buffer may be metadata against the
 1257  *   current transaction.  But the same file is part of a shared mapping
 1258  *   and someone does a writepage() on it.
 1259  *
 1260  *   We will move the buffer onto the async_data list, but *after* it has
 1261  *   been dirtied. So there's a small window where we have dirty data on
 1262  *   BJ_Metadata.
 1263  *
 1264  *   Note that this only applies to the last partial page in the file - the
 1265  *   bit which block_write_full_page() uses prepare/commit for.  (That's
 1266  *   broken code anyway: it's wrong for msync()).
 1267  *
 1268  *   It's a rare case: affects the final partial page, for journalled data
 1269  *   where the file is subject to both write() and writepage() in the same
 1270  *   transaction.  To fix it we'll need a custom block_write_full_page().
 1271  *   We'll probably need that anyway for journalling writepage() output.
 1272  *
 1273  * We don't honour synchronous mounts for writepage().  That would be
 1274  * disastrous.  Any write() or metadata operation will sync the fs for
 1275  * us.
 1276  */
 1277 static int ext3_writepage(struct page *page)
 1278 {
 1279         struct inode *inode = page->mapping->host;
 1280         struct buffer_head *page_buffers;
 1281         handle_t *handle = NULL;
 1282         int ret = 0, err;
 1283         int needed;
 1284         int order_data;
 1285 
 1286         J_ASSERT(PageLocked(page));
 1287         
 1288         /*
 1289          * We give up here if we're reentered, because it might be
 1290          * for a different filesystem.  One *could* look for a
 1291          * nested transaction opportunity.
 1292          */
 1293         lock_kernel();
 1294         if (ext3_journal_current_handle())
 1295                 goto out_fail;
 1296 
 1297         needed = ext3_writepage_trans_blocks(inode);
 1298         if (current->flags & PF_MEMALLOC)
 1299                 handle = ext3_journal_try_start(inode, needed);
 1300         else
 1301                 handle = ext3_journal_start(inode, needed);
 1302                                 
 1303         if (IS_ERR(handle)) {
 1304                 ret = PTR_ERR(handle);
 1305                 goto out_fail;
 1306         }
 1307 
 1308         order_data = ext3_should_order_data(inode) ||
 1309                         ext3_should_journal_data(inode);
 1310 
 1311         unlock_kernel();
 1312 
 1313         page_buffers = NULL;    /* Purely to prevent compiler warning */
 1314 
 1315         /* bget() all the buffers */
 1316         if (order_data) {
 1317                 if (!page->buffers)
 1318                         create_empty_buffers(page,
 1319                                 inode->i_dev, inode->i_sb->s_blocksize);
 1320                 page_buffers = page->buffers;
 1321                 walk_page_buffers(handle, inode, page_buffers, 0,
 1322                                 PAGE_CACHE_SIZE, NULL, bget_one);
 1323         }
 1324 
 1325         ret = block_write_full_page(page, ext3_get_block);
 1326 
 1327         /*
 1328          * The page can become unlocked at any point now, and
 1329          * truncate can then come in and change things.  So we
 1330          * can't touch *page from now on.  But *page_buffers is
 1331          * safe due to elevated refcount.
 1332          */
 1333 
 1334         handle = ext3_journal_current_handle();
 1335         lock_kernel();
 1336 
 1337         /* And attach them to the current transaction */
 1338         if (order_data) {
 1339                 err = walk_page_buffers(handle, inode, page_buffers,
 1340                         0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
 1341                 if (!ret)
 1342                         ret = err;
 1343         }
 1344 
 1345         err = ext3_journal_stop(handle, inode);
 1346         if (!ret)
 1347                 ret = err;
 1348         unlock_kernel();
 1349         return ret;
 1350 
 1351 out_fail:
 1352         
 1353         unlock_kernel();
 1354         SetPageDirty(page);
 1355         UnlockPage(page);
 1356         return ret;
 1357 }
 1358 
 1359 static int ext3_readpage(struct file *file, struct page *page)
 1360 {
 1361         return block_read_full_page(page, ext3_get_block);
 1362 }
 1363 
 1364 
 1365 static int ext3_flushpage(struct page *page, unsigned long offset)
 1366 {
 1367         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 1368         return journal_flushpage(journal, page, offset);
 1369 }
 1370 
 1371 static int ext3_releasepage(struct page *page, int wait)
 1372 {
 1373         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 1374         return journal_try_to_free_buffers(journal, page, wait);
 1375 }
 1376 
 1377 
 1378 struct address_space_operations ext3_aops = {
 1379         readpage:       ext3_readpage,          /* BKL not held.  Don't need */
 1380         writepage:      ext3_writepage,         /* BKL not held.  We take it */
 1381         sync_page:      block_sync_page,
 1382         prepare_write:  ext3_prepare_write,     /* BKL not held.  We take it */
 1383         commit_write:   ext3_commit_write,      /* BKL not held.  We take it */
 1384         bmap:           ext3_bmap,              /* BKL held */
 1385         flushpage:      ext3_flushpage,         /* BKL not held.  Don't need */
 1386         releasepage:    ext3_releasepage,       /* BKL not held.  Don't need */
 1387 };
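      /*
       * The VFS reaches these operations through inode->i_mapping->a_ops,
       * which ext3_read_inode() below points at ext3_aops for regular
       * files and for page-based symlinks; generic_file_read(), for
       * instance, ends up in ext3_readpage() via a_ops->readpage.
       */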
 1388 
 1389 /*
 1390  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
 1391  * up to the end of the block which corresponds to `from'.
 1392  * This is required during truncate. We need to physically zero the tail end
 1393  * of that block so it doesn't yield old data if the file is later grown.
 1394  */
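      /*
       * For illustration, with 4K pages and a 1K blocksize, from == 5632
       * works out as:
       *
       *      index  = 5632 >> 12           == 1
       *      offset = 5632 & 4095          == 1536
       *      length = 1024 - (1536 & 1023) == 512
       *
       * so the final 512 bytes of file block 5 (bytes 1536..2047 of page
       * 1) get zeroed.
       */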
 1395 static int ext3_block_truncate_page(handle_t *handle,
 1396                 struct address_space *mapping, loff_t from)
 1397 {
 1398         unsigned long index = from >> PAGE_CACHE_SHIFT;
 1399         unsigned offset = from & (PAGE_CACHE_SIZE-1);
 1400         unsigned blocksize, iblock, length, pos;
 1401         struct inode *inode = mapping->host;
 1402         struct page *page;
 1403         struct buffer_head *bh;
 1404         int err;
 1405 
 1406         blocksize = inode->i_sb->s_blocksize;
 1407         length = offset & (blocksize - 1);
 1408 
 1409         /* Block boundary? Nothing to do */
 1410         if (!length)
 1411                 return 0;
 1412 
 1413         length = blocksize - length;
 1414         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 1415 
 1416         page = find_or_create_page(mapping, index, GFP_NOFS);
 1417         err = -ENOMEM;
 1418         if (!page)
 1419                 goto out;
 1420 
 1421         if (!page->buffers)
 1422                 create_empty_buffers(page, inode->i_dev, blocksize);
 1423 
 1424         /* Find the buffer that contains "offset" */
 1425         bh = page->buffers;
 1426         pos = blocksize;
 1427         while (offset >= pos) {
 1428                 bh = bh->b_this_page;
 1429                 iblock++;
 1430                 pos += blocksize;
 1431         }
 1432 
 1433         err = 0;
 1434         if (!buffer_mapped(bh)) {
 1435                 /* Hole? Nothing to do */
 1436                 if (buffer_uptodate(bh))
 1437                         goto unlock;
 1438                 ext3_get_block(inode, iblock, bh, 0);
 1439                 /* Still unmapped? Nothing to do */
 1440                 if (!buffer_mapped(bh))
 1441                         goto unlock;
 1442         }
 1443 
 1444         /* Ok, it's mapped. Make sure it's up-to-date */
 1445         if (Page_Uptodate(page))
 1446                 set_bit(BH_Uptodate, &bh->b_state);
 1447 
 1448         if (!buffer_uptodate(bh)) {
 1449                 err = -EIO;
 1450                 ll_rw_block(READ, 1, &bh);
 1451                 wait_on_buffer(bh);
 1452                 /* Uhhuh. Read error. Complain and punt. */
 1453                 if (!buffer_uptodate(bh))
 1454                         goto unlock;
 1455         }
 1456 
 1457         if (ext3_should_journal_data(inode)) {
 1458                 BUFFER_TRACE(bh, "get write access");
 1459                 err = ext3_journal_get_write_access(handle, bh);
 1460                 if (err)
 1461                         goto unlock;
 1462         }
 1463         
 1464         memset(kmap(page) + offset, 0, length);
 1465         flush_dcache_page(page);
 1466         kunmap(page);
 1467 
 1468         BUFFER_TRACE(bh, "zeroed end of block");
 1469 
 1470         err = 0;
 1471         if (ext3_should_journal_data(inode)) {
 1472                 err = ext3_journal_dirty_metadata(handle, bh);
 1473         } else {
 1474                 if (ext3_should_order_data(inode))
 1475                         err = ext3_journal_dirty_data(handle, bh, 0);
 1476                 __mark_buffer_dirty(bh);
 1477         }
 1478 
 1479 unlock:
 1480         UnlockPage(page);
 1481         page_cache_release(page);
 1482 out:
 1483         return err;
 1484 }
 1485 
 1486 /*
 1487  * Probably it should be a library function... search for first non-zero word
 1488  * or memcmp with zero_page, whatever is better for particular architecture.
 1489  * Linus?
 1490  */
 1491 static inline int all_zeroes(u32 *p, u32 *q)
 1492 {
 1493         while (p < q)
 1494                 if (*p++)
 1495                         return 0;
 1496         return 1;
 1497 }
 1498 
 1499 /**
 1500  *      ext3_find_shared - find the indirect blocks for partial truncation.
 1501  *      @inode:   inode in question
 1502  *      @depth:   depth of the affected branch
 1503  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
 1504  *      @chain:   place to store the pointers to partial indirect blocks
 1505  *      @top:     place to the (detached) top of branch
 1506  *
 1507  *      This is a helper function used by ext3_truncate().
 1508  *
 1509  *      When we do truncate() we may have to clean the ends of several
 1510  *      indirect blocks but leave the blocks themselves alive. A block is
 1511  *      partially truncated if some data below the new i_size is referred
 1512  *      to from it (and it is on the path to the first completely truncated
 1513  *      data block, indeed).  We have to free the top of that path along
 1514  *      with everything to the right of the path. Since no allocation
 1515  *      past the truncation point is possible until ext3_truncate()
 1516  *      finishes, we may safely do the latter, but top of branch may
 1517  *      require special attention - pageout below the truncation point
 1518  *      might try to populate it.
 1519  *
 1520  *      We atomically detach the top of branch from the tree, store the
 1521  *      block number of its root in *@top, pointers to buffer_heads of
 1522  *      partially truncated blocks - in @chain[].bh and pointers to
 1523  *      their last elements that should not be removed - in
 1524  *      @chain[].p. Return value is the pointer to last filled element
 1525  *      of @chain.
 1526  *
 1527  *      The work of actually freeing the subtrees is left to the caller:
 1528  *              a) free the subtree starting from *@top
 1529  *              b) free the subtrees whose roots are stored in
 1530  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
 1531  *              c) free the subtrees growing from the inode past the @chain[0].
 1532  *                      (no partially truncated stuff there).  */
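      /*
       * For illustration, with 1K blocks (addr_per_block == 256): when
       * last_block falls in the doubly-indirect region,
       * ext3_block_to_path() produces depth == 3 with
       * offsets[] == { EXT3_DIND_BLOCK, d1, d2 }, and @chain[0..2] walks
       * inode -> dind block -> ind block; the returned partial points at
       * the deepest entry whose block partially survives the truncate.
       */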
 1533 
 1534 static Indirect *ext3_find_shared(struct inode *inode,
 1535                                 int depth,
 1536                                 int offsets[4],
 1537                                 Indirect chain[4],
 1538                                 u32 *top)
 1539 {
 1540         Indirect *partial, *p;
 1541         int k, err;
 1542 
 1543         *top = 0;
 1544         /* Make k index the deepest non-null offset + 1 */
 1545         for (k = depth; k > 1 && !offsets[k-1]; k--)
 1546                 ;
 1547         partial = ext3_get_branch(inode, k, offsets, chain, &err);
 1548         /* Writer: pointers */
 1549         if (!partial)
 1550                 partial = chain + k-1;
 1551         /*
 1552          * If the branch has acquired a continuation since we last looked at it -
 1553          * fine, it should all survive and the (new) top doesn't belong to us.
 1554          */
 1555         if (!partial->key && *partial->p)
 1556                 /* Writer: end */
 1557                 goto no_top;
 1558         for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
 1559                 ;
 1560         /*
 1561          * OK, we've found the last block that must survive. The rest of our
 1562          * branch should be detached before unlocking. However, if that rest
 1563          * of branch is all ours and does not grow immediately from the inode
 1564          * it's easier to cheat and just decrement partial->p.
 1565          */
 1566         if (p == chain + k - 1 && p > chain) {
 1567                 p->p--;
 1568         } else {
 1569                 *top = *p->p;
 1570                 /* Nope, don't do this in ext3.  Must leave the tree intact */
 1571 #if 0
 1572                 *p->p = 0;
 1573 #endif
 1574         }
 1575         /* Writer: end */
 1576 
 1577         while (partial > p) {
 1578                 brelse(partial->bh);
 1579                 partial--;
 1580         }
 1581 
 1582 no_top:
 1583         return partial;
 1584 }
 1585 
 1586 /*
 1587  * Zero a number of block pointers in either an inode or an indirect block.
 1588  * If we restart the transaction we must again get write access to the
 1589  * indirect block for further modification.
 1590  *
 1591  * We release `count' blocks on disk, but (last - first) may be greater
 1592  * than `count' because there can be holes in there.
 1593  */
 1594 static void
 1595 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
 1596                 unsigned long block_to_free, unsigned long count,
 1597                 u32 *first, u32 *last)
 1598 {
 1599         u32 *p;
 1600         if (try_to_extend_transaction(handle, inode)) {
 1601                 if (bh) {
 1602                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 1603                         ext3_journal_dirty_metadata(handle, bh);
 1604                 }
 1605                 ext3_mark_inode_dirty(handle, inode);
 1606                 ext3_journal_test_restart(handle, inode);
 1607                 if (bh) {
 1608                         BUFFER_TRACE(bh, "retaking write access");
 1609                         ext3_journal_get_write_access(handle, bh);
 1610                 }
 1611         }
 1612 
 1613         /*
 1614          * Any buffers which are on the journal will be in memory. We find
 1615          * them on the hash table so journal_revoke() will run journal_forget()
 1616          * on them.  We've already detached each block from the file, so
 1617          * bforget() in journal_forget() should be safe.
 1618          *
 1619          * AKPM: turn on bforget in journal_forget()!!!
 1620          */
 1621         for (p = first; p < last; p++) {
 1622                 u32 nr = le32_to_cpu(*p);
 1623                 if (nr) {
 1624                         struct buffer_head *bh;
 1625 
 1626                         *p = 0;
 1627                         bh = sb_get_hash_table(inode->i_sb, nr);
 1628                         ext3_forget(handle, 0, inode, bh, nr);
 1629                 }
 1630         }
 1631 
 1632         ext3_free_blocks(handle, inode, block_to_free, count);
 1633 }
 1634 
 1635 /**
 1636  * ext3_free_data - free a list of data blocks
 1637  * @handle:     handle for this transaction
 1638  * @inode:      inode we are dealing with
 1639  * @this_bh:    indirect buffer_head which contains *@first and *@last
 1640  * @first:      array of block numbers
 1641  * @last:       points immediately past the end of array
 1642  *
 1643  * We are freeing all blocks referred to from that array (numbers are stored as
 1644  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
 1645  *
 1646  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
 1647  * blocks are contiguous then releasing them at one time will only affect one
 1648  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
 1649  * actually use a lot of journal space.
 1650  *
 1651  * @this_bh will be %NULL if @first and @last point into the inode's direct
 1652  * block pointers.
 1653  */
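      /*
       * For example, if *first..*last holds { 100, 101, 102, 0, 200 },
       * the loop below makes two calls: ext3_clear_blocks(..., 100, 3, ...)
       * for the contiguous run and ext3_clear_blocks(..., 200, 1, ...) for
       * the remainder; the zero entry is a hole and is skipped.
       */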
 1654 static void ext3_free_data(handle_t *handle, struct inode *inode,
 1655                            struct buffer_head *this_bh, u32 *first, u32 *last)
 1656 {
 1657         unsigned long block_to_free = 0;    /* Starting block # of a run */
 1658         unsigned long count = 0;            /* Number of blocks in the run */ 
 1659         u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
 1660                                                corresponding to
 1661                                                block_to_free */
 1662         unsigned long nr;                   /* Current block # */
 1663         u32 *p;                             /* Pointer into inode/ind
 1664                                                for current block */
 1665         int err;
 1666 
 1667         if (this_bh) {                          /* For indirect block */
 1668                 BUFFER_TRACE(this_bh, "get_write_access");
 1669                 err = ext3_journal_get_write_access(handle, this_bh);
 1670                 /* Important: if we can't update the indirect pointers
 1671                  * to the blocks, we can't free them. */
 1672                 if (err)
 1673                         return;
 1674         }
 1675 
 1676         for (p = first; p < last; p++) {
 1677                 nr = le32_to_cpu(*p);
 1678                 if (nr) {
 1679                         /* accumulate blocks to free if they're contiguous */
 1680                         if (count == 0) {
 1681                                 block_to_free = nr;
 1682                                 block_to_free_p = p;
 1683                                 count = 1;
 1684                         } else if (nr == block_to_free + count) {
 1685                                 count++;
 1686                         } else {
 1687                                 ext3_clear_blocks(handle, inode, this_bh, 
 1688                                                   block_to_free,
 1689                                                   count, block_to_free_p, p);
 1690                                 block_to_free = nr;
 1691                                 block_to_free_p = p;
 1692                                 count = 1;
 1693                         }
 1694                 }
 1695         }
 1696 
 1697         if (count > 0)
 1698                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
 1699                                   count, block_to_free_p, p);
 1700 
 1701         if (this_bh) {
 1702                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
 1703                 ext3_journal_dirty_metadata(handle, this_bh);
 1704         }
 1705 }
 1706 
 1707 /**
 1708  *      ext3_free_branches - free an array of branches
 1709  *      @handle: JBD handle for this transaction
 1710  *      @inode: inode we are dealing with
 1711  *      @parent_bh: the buffer_head which contains *@first and *@last
 1712  *      @first: array of block numbers
 1713  *      @last:  pointer immediately past the end of array
 1714  *      @depth: depth of the branches to free
 1715  *
 1716  *      We are freeing all blocks referred to from these branches (numbers are
 1717  *      stored as little-endian 32-bit) and updating @inode->i_blocks
 1718  *      appropriately.
 1719  */
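      /*
       * ext3_truncate() below calls this with @depth 1, 2 and 3 for the
       * wholly-freed singly-, doubly- and triply-indirect subtrees, so
       * the recursion is bounded at three levels; once @depth reaches
       * zero, @first and @last point at data block numbers and we fall
       * through to ext3_free_data().
       */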
 1720 static void ext3_free_branches(handle_t *handle, struct inode *inode,
 1721                                struct buffer_head *parent_bh,
 1722                                u32 *first, u32 *last, int depth)
 1723 {
 1724         unsigned long nr;
 1725         u32 *p;
 1726 
 1727         if (is_handle_aborted(handle))
 1728                 return;
 1729         
 1730         if (depth--) {
 1731                 struct buffer_head *bh;
 1732                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 1733                 p = last;
 1734                 while (--p >= first) {
 1735                         nr = le32_to_cpu(*p);
 1736                         if (!nr)
 1737                                 continue;               /* A hole */
 1738 
 1739                         /* Go read the buffer for the next level down */
 1740                         bh = sb_bread(inode->i_sb, nr);
 1741 
 1742                         /*
 1743                          * A read failure? Report error and clear slot
 1744                          * (should be rare).
 1745                          */
 1746                         if (!bh) {
 1747                                 ext3_error(inode->i_sb, "ext3_free_branches",
 1748                                            "Read failure, inode=%ld, block=%ld",
 1749                                            inode->i_ino, nr);
 1750                                 continue;
 1751                         }
 1752 
 1753                         /* This zaps the entire block.  Bottom up. */
 1754                         BUFFER_TRACE(bh, "free child branches");
 1755                         ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
 1756                                            (u32*)bh->b_data + addr_per_block,
 1757                                            depth);
 1758 
 1759                         /*
 1760                          * We've probably journalled the indirect block several
 1761                          * times during the truncate.  But it's no longer
 1762                          * needed and we now drop it from the transaction via
 1763                          * journal_revoke().
 1764                          *
 1765                          * That's easy if it's exclusively part of this
 1766                          * transaction.  But if it's part of the committing
 1767                          * transaction then journal_forget() will simply
 1768                          * brelse() it.  That means that if the underlying
 1769                          * block is reallocated in ext3_get_block(),
 1770                          * unmap_underlying_metadata() will find this block
 1771                          * and will try to get rid of it.  damn, damn.
 1772                          *
 1773                          * If this block has already been committed to the
 1774                          * journal, a revoke record will be written.  And
 1775                          * revoke records must be emitted *before* clearing
 1776                          * this block's bit in the bitmaps.
 1777                          */
 1778                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
 1779 
 1780                         /*
 1781                          * Everything below this pointer has been
 1782                          * released.  Now let this top-of-subtree go.
 1783                          *
 1784                          * We want the freeing of this indirect block to be
 1785                          * atomic in the journal with the updating of the
 1786                          * bitmap block which owns it.  So make some room in
 1787                          * the journal.
 1788                          *
 1789                          * We zero the parent pointer *after* freeing its
 1790                          * pointee in the bitmaps, so if extend_transaction()
 1791                          * for some reason fails to put the bitmap changes and
 1792                          * the release into the same transaction, recovery
 1793                          * will merely complain about releasing a free block,
 1794                          * rather than leaking blocks.
 1795                          */
 1796                         if (is_handle_aborted(handle))
 1797                                 return;
 1798                         if (try_to_extend_transaction(handle, inode)) {
 1799                                 ext3_mark_inode_dirty(handle, inode);
 1800                                 ext3_journal_test_restart(handle, inode);
 1801                         }
 1802 
 1803                         ext3_free_blocks(handle, inode, nr, 1);
 1804 
 1805                         if (parent_bh) {
 1806                                 /*
 1807                                  * The block which we have just freed is
 1808                                  * pointed to by an indirect block: journal it
 1809                                  */
 1810                                 BUFFER_TRACE(parent_bh, "get_write_access");
 1811                                 if (!ext3_journal_get_write_access(handle,
 1812                                                                    parent_bh)){
 1813                                         *p = 0;
 1814                                         BUFFER_TRACE(parent_bh,
 1815                                         "call ext3_journal_dirty_metadata");
 1816                                         ext3_journal_dirty_metadata(handle, 
 1817                                                                     parent_bh);
 1818                                 }
 1819                         }
 1820                 }
 1821         } else {
 1822                 /* We have reached the bottom of the tree. */
 1823                 BUFFER_TRACE(parent_bh, "free data blocks");
 1824                 ext3_free_data(handle, inode, parent_bh, first, last);
 1825         }
 1826 }
 1827 
 1828 /*
 1829  * ext3_truncate()
 1830  *
 1831  * We block out ext3_get_block() block instantiations across the entire
 1832  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
 1833  * simultaneously on behalf of the same inode.
 1834  *
 1835  * As we work through the truncate and commit bits of it to the journal there
 1836  * is one core, guiding principle: the file's tree must always be consistent on
 1837  * disk.  We must be able to restart the truncate after a crash.
 1838  *
 1839  * The file's tree may be transiently inconsistent in memory (although it
 1840  * probably isn't), but whenever we close off and commit a journal transaction,
 1841  * the contents of (the filesystem + the journal) must be consistent and
 1842  * restartable.  It's pretty simple, really: bottom up, right to left (although
 1843  * left-to-right works OK too).
 1844  *
 1845  * Note that at recovery time, journal replay occurs *before* the restart of
 1846  * truncate against the orphan inode list.
 1847  *
 1848  * The committed inode has the new, desired i_size (which is the same as
 1849  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
 1850  * that this inode's truncate did not complete and it will again call
 1851  * ext3_truncate() to have another go.  So there will be instantiated blocks
 1852  * to the right of the truncation point in a crashed ext3 filesystem.  But
 1853  * that's fine - as long as they are linked from the inode, the post-crash
 1854  * ext3_truncate() run will find them and release them.
 1855  */
 1856 
 1857 void ext3_truncate(struct inode * inode)
 1858 {
 1859         handle_t *handle;
 1860         u32 *i_data = inode->u.ext3_i.i_data;
 1861         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 1862         int offsets[4];
 1863         Indirect chain[4];
 1864         Indirect *partial;
 1865         int nr = 0;
 1866         int n;
 1867         long last_block;
 1868         unsigned blocksize;
 1869 
 1870         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 1871             S_ISLNK(inode->i_mode)))
 1872                 return;
 1873         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 1874                 return;
 1875 
 1876         ext3_discard_prealloc(inode);
 1877 
 1878         handle = start_transaction(inode);
 1879         if (IS_ERR(handle))
 1880                 return;         /* AKPM: return what? */
 1881 
 1882         blocksize = inode->i_sb->s_blocksize;
 1883         last_block = (inode->i_size + blocksize-1)
 1884                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
 1885 
 1886         ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
 1887                 
 1888 
 1889         n = ext3_block_to_path(inode, last_block, offsets);
 1890         if (n == 0)
 1891                 goto out_stop;  /* error */
 1892 
 1893         /*
 1894          * OK.  This truncate is going to happen.  We add the inode to the
 1895          * orphan list, so that if this truncate spans multiple transactions,
 1896          * and we crash, we will resume the truncate when the filesystem
 1897          * recovers.  It also marks the inode dirty, to catch the new size.
 1898          *
 1899          * Implication: the file must always be in a sane, consistent
 1900          * truncatable state while each transaction commits.
 1901          */
 1902         if (ext3_orphan_add(handle, inode))
 1903                 goto out_stop;
 1904 
 1905         /*
 1906          * The orphan list entry will now protect us from any crash which
 1907          * occurs before the truncate completes, so it is now safe to propagate
 1908          * the new, shorter inode size (held for now in i_size) into the
 1909          * on-disk inode. We do this via i_disksize, which is the value which
 1910          * ext3 *really* writes onto the disk inode.
 1911          */
 1912         inode->u.ext3_i.i_disksize = inode->i_size;
 1913 
 1914         /*
 1915          * From here we block out all ext3_get_block() callers who want to
 1916          * modify the block allocation tree.
 1917          */
 1918         down_write(&inode->u.ext3_i.truncate_sem);
 1919 
 1920         if (n == 1) {           /* direct blocks */
 1921                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
 1922                                i_data + EXT3_NDIR_BLOCKS);
 1923                 goto do_indirects;
 1924         }
 1925 
 1926         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
 1927         /* Kill the top of shared branch (not detached) */
 1928         if (nr) {
 1929                 if (partial == chain) {
 1930                         /* Shared branch grows from the inode */
 1931                         ext3_free_branches(handle, inode, NULL,
 1932                                            &nr, &nr+1, (chain+n-1) - partial);
 1933                         *partial->p = 0;
 1934                         /*
 1935                          * We mark the inode dirty prior to restart,
 1936                          * and prior to stop.  No need for it here.
 1937                          */
 1938                 } else {
 1939                         /* Shared branch grows from an indirect block */
 1940                         BUFFER_TRACE(partial->bh, "get_write_access");
 1941                         ext3_free_branches(handle, inode, partial->bh,
 1942                                         partial->p,
 1943                                         partial->p+1, (chain+n-1) - partial);
 1944                 }
 1945         }
 1946         /* Clear the ends of indirect blocks on the shared branch */
 1947         while (partial > chain) {
 1948                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
 1949                                    (u32*)partial->bh->b_data + addr_per_block,
 1950                                    (chain+n-1) - partial);
 1951                 BUFFER_TRACE(partial->bh, "call brelse");
 1952                 brelse (partial->bh);
 1953                 partial--;
 1954         }
 1955 do_indirects:
 1956         /* Kill the remaining (whole) subtrees */
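              /*
               * Note the deliberate fall-through: starting from the first
               * wholly-truncated tree, each case also frees everything
               * deeper.  The "default" case (a direct-block index) frees
               * all three indirect trees.
               */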
 1957         switch (offsets[0]) {
 1958                 default:
 1959                         nr = i_data[EXT3_IND_BLOCK];
 1960                         if (nr) {
 1961                                 ext3_free_branches(handle, inode, NULL,
 1962                                                    &nr, &nr+1, 1);
 1963                                 i_data[EXT3_IND_BLOCK] = 0;
 1964                         }
 1965                 case EXT3_IND_BLOCK:
 1966                         nr = i_data[EXT3_DIND_BLOCK];
 1967                         if (nr) {
 1968                                 ext3_free_branches(handle, inode, NULL,
 1969                                                    &nr, &nr+1, 2);
 1970                                 i_data[EXT3_DIND_BLOCK] = 0;
 1971                         }
 1972                 case EXT3_DIND_BLOCK:
 1973                         nr = i_data[EXT3_TIND_BLOCK];
 1974                         if (nr) {
 1975                                 ext3_free_branches(handle, inode, NULL,
 1976                                                    &nr, &nr+1, 3);
 1977                                 i_data[EXT3_TIND_BLOCK] = 0;
 1978                         }
 1979                 case EXT3_TIND_BLOCK:
 1980                         ;
 1981         }
 1982         up_write(&inode->u.ext3_i.truncate_sem);
 1983         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 1984         ext3_mark_inode_dirty(handle, inode);
 1985 
 1986         /* In a multi-transaction truncate, we only make the final
 1987          * transaction synchronous */
 1988         if (IS_SYNC(inode))
 1989                 handle->h_sync = 1;
 1990 out_stop:
 1991         /*
 1992          * If this was a simple ftruncate(), and the file will remain alive
 1993          * then we need to clear up the orphan record which we created above.
 1994          * However, if this was a real unlink then we were called by
 1995          * ext3_delete_inode(), and we allow that function to clean up the
 1996          * orphan info for us.
 1997          */
 1998         if (inode->i_nlink)
 1999                 ext3_orphan_del(handle, inode);
 2000 
 2001         ext3_journal_stop(handle, inode);
 2002 }
 2003 
 2004 /* 
 2005  * ext3_get_inode_loc returns with an extra refcount against the
 2006  * inode's underlying buffer_head on success. 
 2007  */
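      /*
       * For illustration with toy parameters (128-byte inodes, 1K blocks,
       * 16 inodes per group), inode 21 locates as:
       *
       *      block_group = (21 - 1) / 16                == 1
       *      offset      = ((21 - 1) % 16) * 128        == 512
       *      block       = bg_inode_table + (512 >> 10) == bg_inode_table
       *
       * leaving the raw inode at byte offset 512 within that block.
       */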
 2008 
 2009 int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
 2010 {
 2011         struct buffer_head *bh = 0;
 2012         unsigned long block;
 2013         unsigned long block_group;
 2014         unsigned long group_desc;
 2015         unsigned long desc;
 2016         unsigned long offset;
 2017         struct ext3_group_desc * gdp;
 2018                 
 2019         if ((inode->i_ino != EXT3_ROOT_INO &&
 2020                 inode->i_ino != EXT3_ACL_IDX_INO &&
 2021                 inode->i_ino != EXT3_ACL_DATA_INO &&
 2022                 inode->i_ino != EXT3_JOURNAL_INO &&
 2023                 inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
 2024                 inode->i_ino > le32_to_cpu(
 2025                         inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
 2026                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
 2027                             "bad inode number: %lu", inode->i_ino);
 2028                 goto bad_inode;
 2029         }
 2030         block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
 2031         if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
 2032                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
 2033                             "group >= groups count");
 2034                 goto bad_inode;
 2035         }
 2036         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
 2037         desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
 2038         bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
 2039         if (!bh) {
 2040                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
 2041                             "Descriptor not loaded");
 2042                 goto bad_inode;
 2043         }
 2044 
 2045         gdp = (struct ext3_group_desc *) bh->b_data;
 2046         /*
 2047          * Figure out the offset within the block group inode table
 2048          */
 2049         offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
 2050                 EXT3_INODE_SIZE(inode->i_sb);
 2051         block = le32_to_cpu(gdp[desc].bg_inode_table) +
 2052                 (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
 2053         if (!(bh = sb_bread(inode->i_sb, block))) {
 2054                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
 2055                             "unable to read inode block - "
 2056                             "inode=%lu, block=%lu", inode->i_ino, block);
 2057                 goto bad_inode;
 2058         }
 2059         offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
 2060 
 2061         iloc->bh = bh;
 2062         iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
 2063         iloc->block_group = block_group;
 2064         
 2065         return 0;
 2066         
 2067  bad_inode:
 2068         return -EIO;
 2069 }
 2070 
 2071 void ext3_set_inode_flags(struct inode *inode)
 2072 {
 2073         unsigned int flags = inode->u.ext3_i.i_flags;
 2074 
 2075         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME);
 2076         if (flags & EXT3_SYNC_FL)
 2077                 inode->i_flags |= S_SYNC;
 2078         if (flags & EXT3_APPEND_FL)
 2079                 inode->i_flags |= S_APPEND;
 2080         if (flags & EXT3_IMMUTABLE_FL)
 2081                 inode->i_flags |= S_IMMUTABLE;
 2082         if (flags & EXT3_NOATIME_FL)
 2083                 inode->i_flags |= S_NOATIME;
 2084 }
 2085 
 2086 
 2087 void ext3_read_inode(struct inode * inode)
 2088 {
 2089         struct ext3_iloc iloc;
 2090         struct ext3_inode *raw_inode;
 2091         struct buffer_head *bh;
 2092         int block;
 2093         
 2094         if(ext3_get_inode_loc(inode, &iloc))
 2095                 goto bad_inode;
 2096         bh = iloc.bh;
 2097         raw_inode = iloc.raw_inode;
 2098         init_rwsem(&inode->u.ext3_i.truncate_sem);
 2099         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 2100         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 2101         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
 2102         if(!(test_opt (inode->i_sb, NO_UID32))) {
 2103                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 2104                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 2105         }
 2106         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 2107         inode->i_size = le32_to_cpu(raw_inode->i_size);
 2108         inode->i_atime = le32_to_cpu(raw_inode->i_atime);
 2109         inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
 2110         inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
 2111         inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
 2112         /* We now have enough fields to check if the inode was active or not.
 2113          * This is needed because nfsd might try to access dead inodes;
 2114          * the test is the same one that e2fsck uses.
 2115          * NeilBrown 1999oct15
 2116          */
 2117         if (inode->i_nlink == 0) {
 2118                 if (inode->i_mode == 0 ||
 2119                     !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
 2120                         /* this inode is deleted */
 2121                         brelse (bh);
 2122                         goto bad_inode;
 2123                 }
 2124                 /* The only unlinked inodes we let through here have
 2125                  * valid i_mode and are being read by the orphan
 2126                  * recovery code: that's fine, we're about to complete
 2127                  * the process of deleting those. */
 2128         }
 2129         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
 2130                                          * (for stat), not the fs block
 2131                                          * size */  
 2132         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 2133         inode->i_version = ++event;
 2134         inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
 2135 #ifdef EXT3_FRAGMENTS
 2136         inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
 2137         inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
 2138         inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
 2139 #endif
 2140         inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
 2141         if (!S_ISREG(inode->i_mode)) {
 2142                 inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
 2143         } else {
 2144                 inode->i_size |=
 2145                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
 2146         }
 2147         inode->u.ext3_i.i_disksize = inode->i_size;
 2148         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 2149 #ifdef EXT3_PREALLOCATE
 2150         inode->u.ext3_i.i_prealloc_count = 0;
 2151 #endif
 2152         inode->u.ext3_i.i_block_group = iloc.block_group;
 2153 
 2154         /*
 2155          * NOTE! The in-memory inode i_data array is in little-endian order
 2156          * even on big-endian machines: we do NOT byteswap the block numbers!
 2157          */
 2158         for (block = 0; block < EXT3_N_BLOCKS; block++)
 2159                 inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
 2160         INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
 2161 
 2162         if (inode->i_ino == EXT3_ACL_IDX_INO ||
 2163             inode->i_ino == EXT3_ACL_DATA_INO)
 2164                 /* Nothing to do */ ;
 2165         else if (S_ISREG(inode->i_mode)) {
 2166                 inode->i_op = &ext3_file_inode_operations;
 2167                 inode->i_fop = &ext3_file_operations;
 2168                 inode->i_mapping->a_ops = &ext3_aops;
 2169         } else if (S_ISDIR(inode->i_mode)) {
 2170                 inode->i_op = &ext3_dir_inode_operations;
 2171                 inode->i_fop = &ext3_dir_operations;
 2172         } else if (S_ISLNK(inode->i_mode)) {
 2173                 if (!inode->i_blocks)
 2174                         inode->i_op = &ext3_fast_symlink_inode_operations;
 2175                 else {
 2176                         inode->i_op = &page_symlink_inode_operations;
 2177                         inode->i_mapping->a_ops = &ext3_aops;
 2178                 }
 2179         } else 
 2180                 init_special_inode(inode, inode->i_mode,
 2181                                    le32_to_cpu(iloc.raw_inode->i_block[0]));
 2182         brelse(iloc.bh);
 2183         ext3_set_inode_flags(inode);
 2184         return;
 2185         
 2186 bad_inode:
 2187         make_bad_inode(inode);
 2188         return;
 2189 }
 2190 
 2191 /*
 2192  * Post the struct inode info into an on-disk inode location in the
 2193  * buffer-cache.  This gobbles the caller's reference to the
 2194  * buffer_head in the inode location struct.  
 2195  */
 2196 
 2197 static int ext3_do_update_inode(handle_t *handle, 
 2198                                 struct inode *inode, 
 2199                                 struct ext3_iloc *iloc)
 2200 {
 2201         struct ext3_inode *raw_inode = iloc->raw_inode;
 2202         struct buffer_head *bh = iloc->bh;
 2203         int err = 0, rc, block;
 2204 
 2205         if (handle) {
 2206                 BUFFER_TRACE(bh, "get_write_access");
 2207                 err = ext3_journal_get_write_access(handle, bh);
 2208                 if (err)
 2209                         goto out_brelse;
 2210         }
 2211         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 2212         if(!(test_opt(inode->i_sb, NO_UID32))) {
 2213                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 2214                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 2215 /*
 2216  * Fix up interoperability with old kernels. Otherwise, old inodes get
 2217  * re-used with the upper 16 bits of the uid/gid intact
 2218  */
 2219                 if(!inode->u.ext3_i.i_dtime) {
 2220                         raw_inode->i_uid_high =
 2221                                 cpu_to_le16(high_16_bits(inode->i_uid));
 2222                         raw_inode->i_gid_high =
 2223                                 cpu_to_le16(high_16_bits(inode->i_gid));
 2224                 } else {
 2225                         raw_inode->i_uid_high = 0;
 2226                         raw_inode->i_gid_high = 0;
 2227                 }
 2228         } else {
 2229                 raw_inode->i_uid_low =
 2230                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
 2231                 raw_inode->i_gid_low =
 2232                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
 2233                 raw_inode->i_uid_high = 0;
 2234                 raw_inode->i_gid_high = 0;
 2235         }
 2236         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
 2237         raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
 2238         raw_inode->i_atime = cpu_to_le32(inode->i_atime);
 2239         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
 2240         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
 2241         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
 2242         raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
 2243         raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
 2244 #ifdef EXT3_FRAGMENTS
 2245         raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
 2246         raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
 2247         raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
 2248 #else
 2249         /* If we are not tracking these fields in the in-memory inode,
 2250          * then preserve them on disk, but still initialise them to zero
 2251          * for new inodes. */
 2252         if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
 2253                 raw_inode->i_faddr = 0;
 2254                 raw_inode->i_frag = 0;
 2255                 raw_inode->i_fsize = 0;
 2256         }
 2257 #endif
 2258         raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
 2259         if (!S_ISREG(inode->i_mode)) {
 2260                 raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
 2261         } else {
 2262                 raw_inode->i_size_high =
 2263                         cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
 2264                 if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
 2265                         struct super_block *sb = inode->i_sb;
 2266                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
 2267                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
 2268                             EXT3_SB(sb)->s_es->s_rev_level ==
 2269                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
 2270                                /* If this is the first large file
 2271                                 * created, add a flag to the superblock.
 2272                                 */
 2273                                 err = ext3_journal_get_write_access(handle,
 2274                                                 sb->u.ext3_sb.s_sbh);
 2275                                 if (err)
 2276                                         goto out_brelse;
 2277                                 ext3_update_dynamic_rev(sb);
 2278                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
 2279                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
 2280                                 sb->s_dirt = 1;
 2281                                 handle->h_sync = 1;
 2282                                 err = ext3_journal_dirty_metadata(handle,
 2283                                                 sb->u.ext3_sb.s_sbh);
 2284                         }
 2285                 }
 2286         }
 2287         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 2288         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 2289                 raw_inode->i_block[0] =
 2290                         cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
 2291         else for (block = 0; block < EXT3_N_BLOCKS; block++)
 2292                 raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
 2293 
 2294         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 2295         rc = ext3_journal_dirty_metadata(handle, bh);
 2296         if (!err)
 2297                 err = rc;
 2298         EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
 2299 
 2300 out_brelse:
 2301         brelse (bh);
 2302         ext3_std_error(inode->i_sb, err);
 2303         return err;
 2304 }
 2305 
 2306 /*
 2307  * ext3_write_inode()
 2308  *
 2309  * We are called from a few places:
 2310  *
 2311  * - Within generic_file_write() for O_SYNC files.
 2312  *   Here, there will be no transaction running. We wait for any running
 2313  *   transaction to commit.
 2314  *
 2315  * - Within sys_sync(), kupdate and such.
 2316  *   We wait on commit, if told to.
 2317  *
 2318  * - Within prune_icache() (PF_MEMALLOC == true)
 2319  *   Here we simply return.  We can't afford to block kswapd on the
 2320  *   journal commit.
 2321  *
 2322  * In all cases it is actually safe for us to return without doing anything,
 2323  * because the inode has been copied into a raw inode buffer in
 2324  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
 2325  * knfsd.
 2326  *
 2327  * Note that we are absolutely dependent upon all inode dirtiers doing the
 2328  * right thing: they *must* call mark_inode_dirty() after dirtying info in
 2329  * which we are interested.
 2330  *
 2331  * It would be a bug for them to not do this.  The code:
 2332  *
 2333  *      mark_inode_dirty(inode)
 2334  *      stuff();
 2335  *      inode->i_size = expr;
 2336  *
 2337  * is in error because a kswapd-driven write_inode() could occur while
 2338  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
 2339  * will no longer be on the superblock's dirty inode list.
 2340  */
 2341 void ext3_write_inode(struct inode *inode, int wait)
 2342 {
 2343         if (current->flags & PF_MEMALLOC)
 2344                 return;
 2345 
 2346         if (ext3_journal_current_handle()) {
 2347                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
 2348                 return;
 2349         }
 2350 
 2351         if (!wait)
 2352                 return;
 2353 
 2354         ext3_force_commit(inode->i_sb); 
 2355 }
 2356 
 2357 /*
 2358  * ext3_setattr()
 2359  *
 2360  * Called from notify_change.
 2361  *
 2362  * We want to trap VFS attempts to truncate the file as soon as
 2363  * possible.  In particular, we want to make sure that when the VFS
 2364  * shrinks i_size, we put the inode on the orphan list and modify
 2365  * i_disksize immediately, so that during the subsequent flushing of
 2366  * dirty pages and freeing of disk blocks, we can guarantee that any
 2367  * commit will leave the blocks being flushed in an unused state on
 2368  * disk.  (On recovery, the inode will get truncated and the blocks will
 2369  * be freed, so we have a strong guarantee that no future commit will
 2370  * leave these blocks visible to the user.)  
 2371  *
 2372  * This is only needed for regular files.  rmdir() has its own path, and
 2373  * we can never truncate a directory except on final unlink (at which
 2374  * point i_nlink is zero so recovery is easy.)
 2375  *
 2376  * Called with the BKL.  
 2377  */
 2378 
 2379 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 2380 {
 2381         struct inode *inode = dentry->d_inode;
 2382         int error, rc = 0;
 2383         const unsigned int ia_valid = attr->ia_valid;
 2384 
 2385         error = inode_change_ok(inode, attr);
 2386         if (error)
 2387                 return error;
 2388 
 2389         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 2390                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
 2391                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 2392                 if (error)
 2393                         return error;
 2394         }
 2395 
 2396         if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
 2397                 handle_t *handle;
 2398 
 2399                 handle = ext3_journal_start(inode, 3);
 2400                 if (IS_ERR(handle)) {
 2401                         error = PTR_ERR(handle);
 2402                         goto err_out;
 2403                 }
 2404                 
 2405                 error = ext3_orphan_add(handle, inode);
 2406                 inode->u.ext3_i.i_disksize = attr->ia_size;
 2407                 rc = ext3_mark_inode_dirty(handle, inode);
 2408                 if (!error)
 2409                         error = rc;
 2410                 ext3_journal_stop(handle, inode);
 2411         }
 2412         
 2413         rc = inode_setattr(inode, attr);
 2414 
 2415         /* If inode_setattr's call to ext3_truncate failed to get a
 2416          * transaction handle at all, we need to clean up the in-core
 2417          * orphan list manually. */
 2418         if (inode->i_nlink)
 2419                 ext3_orphan_del(NULL, inode);
 2420 
 2421 err_out:
 2422         ext3_std_error(inode->i_sb, error);
 2423         if (!error)
 2424                 error = rc;
 2425         return error;
 2426 }
 2427 
 2428 
 2429 /*
 2430  * akpm: how many blocks doth make a writepage()?
 2431  *
 2432  * With N blocks per page, it may be:
 2433  * N data blocks
 2434  * 2 indirect blocks
 2435  * 2 dindirect blocks
 2436  * 1 tindirect block
 2437  * N+5 bitmap blocks (from the above)
 2438  * N+5 group descriptor summary blocks
 2439  * 1 inode block
 2440  * 1 superblock.
 2441  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
 2442  *
 2443  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
 2444  *
 2445  * With ordered or writeback data it's the same, less the N data blocks.
 2446  *
 2447  * If the inode's direct blocks can hold an integral number of pages then a
 2448  * page cannot straddle two indirect blocks, and we can only touch one indirect
 2449  * and dindirect block, and the "5" above becomes "3".
 2450  *
 2451  * This still overestimates under most circumstances.  If we were to pass the
 2452  * start and end offsets in here as well we could do block_to_path() on each
 2453  * block and work out the exact number of indirects which are touched.  Pah.
 2454  */
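      /*
       * For example, with 4K pages and 1K blocks (bpp == 4):
       * EXT3_NDIR_BLOCKS (12) % 4 == 0, so a page cannot straddle two
       * indirect blocks and indirects == 3.  The estimate below is then
       * 3 * (4 + 3) + 2 == 23 blocks in data=journal mode and
       * 2 * (4 + 3) + 2 == 16 otherwise, plus the quota blocks when
       * CONFIG_QUOTA is set.
       */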
 2455 
 2456 int ext3_writepage_trans_blocks(struct inode *inode)
 2457 {
 2458         int bpp = ext3_journal_blocks_per_page(inode);
 2459         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
 2460         int ret;
 2461         
 2462         if (ext3_should_journal_data(inode))
 2463                 ret = 3 * (bpp + indirects) + 2;
 2464         else
 2465                 ret = 2 * (bpp + indirects) + 2;
 2466 
 2467 #ifdef CONFIG_QUOTA
 2468         ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
 2469 #endif
 2470 
 2471         return ret;
 2472 }
 2473 
 2474 int
 2475 ext3_mark_iloc_dirty(handle_t *handle, 
 2476                      struct inode *inode,
 2477                      struct ext3_iloc *iloc)
 2478 {
 2479         int err = 0;
 2480 
 2481         if (handle) {
 2482                 /* the do_update_inode consumes one bh->b_count */
 2483                 atomic_inc(&iloc->bh->b_count);
 2484                 err = ext3_do_update_inode(handle, inode, iloc);
 2485                 /* ext3_do_update_inode() does journal_dirty_metadata */
 2486                 brelse(iloc->bh);
 2487         } else {
 2488                 printk(KERN_EMERG "%s: called with no handle!\n", __FUNCTION__);
 2489         }
 2490         return err;
 2491 }
 2492 
 2493 /* 
 2494  * On success, we end up with an outstanding reference count against
 2495  * iloc->bh.  This _must_ be cleaned up later. 
 2496  */
 2497 
 2498 int
 2499 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
 2500                          struct ext3_iloc *iloc)
 2501 {
 2502         int err = 0;
 2503         if (handle) {
 2504                 err = ext3_get_inode_loc(inode, iloc);
 2505                 if (!err) {
 2506                         BUFFER_TRACE(iloc->bh, "get_write_access");
 2507                         err = ext3_journal_get_write_access(handle, iloc->bh);
 2508                         if (err) {
 2509                                 brelse(iloc->bh);
 2510                                 iloc->bh = NULL;
 2511                         }
 2512                 }
 2513         }
 2514         ext3_std_error(inode->i_sb, err);
 2515         return err;
 2516 }
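      /*
       * The canonical caller pattern (see ext3_mark_inode_dirty() below)
       * pairs the reference taken here with ext3_mark_iloc_dirty(), which
       * consumes it:
       *
       *      struct ext3_iloc iloc;
       *      err = ext3_reserve_inode_write(handle, inode, &iloc);
       *      if (!err) {
       *              ...update the in-core inode...
       *              err = ext3_mark_iloc_dirty(handle, inode, &iloc);
       *      }
       */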
 2517 
 2518 /*
 2519  * akpm: What we do here is to mark the in-core inode as clean
 2520  * with respect to inode dirtiness (it may still be data-dirty).
 2521  * This means that the in-core inode may be reaped by prune_icache
 2522  * without having to perform any I/O.  This is a very good thing,
 2523  * because *any* task may call prune_icache - even ones which
 2524  * have a transaction open against a different journal.
 2525  *
 2526  * Is this cheating?  Not really.  Sure, we haven't written the
 2527  * inode out, but prune_icache isn't a user-visible syncing function.
 2528  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 2529  * we start and wait on commits.
 2530  *
 2531  * Is this efficient/effective?  Well, we're being nice to the system
 2532  * by cleaning up our inodes proactively so they can be reaped
 2533  * without I/O.  But we are potentially leaving up to five seconds'
 2534  * worth of inodes floating about which prune_icache wants us to
 2535  * write out.  One way to fix that would be to get prune_icache()
 2536  * to do a write_super() to free up some memory.  It has the desired
 2537  * effect.
 2538  */
 2539 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 2540 {
 2541         struct ext3_iloc iloc;
 2542         int err;
 2543 
 2544         err = ext3_reserve_inode_write(handle, inode, &iloc);
 2545         if (!err)
 2546                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 2547         return err;
 2548 }
 2549 
 2550 /*
 2551  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
 2552  *
 2553  * We're really interested in the case where a file is being extended.
 2554  * i_size has been changed by generic_commit_write() and we thus need
 2555  * to include the updated inode in the current transaction.
 2556  *
 2557  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
 2558  * are allocated to the file.
 2559  *
 2560  * If the inode is marked synchronous, we don't honour that here - doing
 2561  * so would cause a commit on atime updates, which we don't bother doing.
 2562  * We handle synchronous inodes at the highest possible level.
 2563  */
 2564 void ext3_dirty_inode(struct inode *inode)
 2565 {
 2566         handle_t *current_handle = ext3_journal_current_handle();
 2567         handle_t *handle;
 2568 
 2569         lock_kernel();
 2570         handle = ext3_journal_start(inode, 2);
 2571         if (IS_ERR(handle))
 2572                 goto out;
 2573         if (current_handle &&
 2574                 current_handle->h_transaction != handle->h_transaction) {
 2575                 /* This task has a transaction open against a different fs */
 2576                 printk(KERN_EMERG "%s: transactions do not match!\n",
 2577                         __FUNCTION__);
 2578         } else {
 2579                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
 2580                                 current_handle);
 2581                 ext3_mark_inode_dirty(handle, inode);
 2582         }
 2583         ext3_journal_stop(handle, inode);
 2584 out:
 2585         unlock_kernel();
 2586 }
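
/*
 * Illustrative sketch, an assumption about wiring rather than code from
 * this file: ext3_dirty_inode() is reached through the ->dirty_inode
 * method of ext3's super_operations (fs/ext3/super.c), which is how
 * __mark_inode_dirty() calls into the filesystem:
 */
#if 0   /* example only */
static struct super_operations example_ext3_sops = {
        dirty_inode:    ext3_dirty_inode,       /* old-style initializer */
        /* ... the remaining methods ... */
};
#endif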
 2587 
 2588 #ifdef AKPM
 2589 /* 
 2590  * Bind an inode's backing buffer_head into this transaction, to prevent
 2591  * it from being flushed to disk early.  Unlike
 2592  * ext3_reserve_inode_write, this leaves behind no bh reference and
 2593  * returns no iloc structure, so the caller needs to repeat the iloc
 2594  * lookup to mark the inode dirty later.
 2595  */
 2596 static inline int
 2597 ext3_pin_inode(handle_t *handle, struct inode *inode)
 2598 {
 2599         struct ext3_iloc iloc;
 2600         
 2601         int err = 0;
 2602         if (handle) {
 2603                 err = ext3_get_inode_loc(inode, &iloc);
 2604                 if (!err) {
 2605                         BUFFER_TRACE(iloc.bh, "get_write_access");
 2606                         err = journal_get_write_access(handle, iloc.bh);
 2607                         if (!err)
 2608                                 err = ext3_journal_dirty_metadata(handle, 
 2609                                                                   iloc.bh);
 2610                         brelse(iloc.bh);
 2611                 }
 2612         }
 2613         ext3_std_error(inode->i_sb, err);
 2614         return err;
 2615 }
 2616 #endif
 2617 
 2618 int ext3_change_inode_journal_flag(struct inode *inode, int val)
 2619 {
 2620         journal_t *journal;
 2621         handle_t *handle;
 2622         int err;
 2623 
 2624         /*
 2625          * We have to be very careful here: changing a data block's
 2626          * journaling status dynamically is dangerous.  If we write a
 2627          * data block to the journal, change the status and then delete
 2628          * that block, we risk forgetting to revoke the old log record
 2629          * from the journal and so a subsequent replay can corrupt data.
 2630          * So, first we make sure that the journal is empty and that
 2631          * nobody is changing anything.
 2632          */
 2633 
 2634         journal = EXT3_JOURNAL(inode);
 2635         if (is_journal_aborted(journal) || IS_RDONLY(inode))
 2636                 return -EROFS;
 2637         
 2638         journal_lock_updates(journal);
 2639         journal_flush(journal);
 2640 
 2641         /*
 2642          * OK, there are no updates running now, and all cached data is
 2643          * synced to disk.  We are now in a completely consistent state
 2644          * which doesn't have anything in the journal, and we know that
 2645          * no filesystem updates are running, so it is safe to modify
 2646          * the inode's in-core data-journaling state flag now.
 2647          */
 2648 
 2649         if (val)
 2650                 inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
 2651         else
 2652                 inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
 2653 
 2654         journal_unlock_updates(journal);
 2655 
 2656         /* Finally we can mark the inode as dirty. */
 2657 
 2658         handle = ext3_journal_start(inode, 1);
 2659         if (IS_ERR(handle))
 2660                 return PTR_ERR(handle);
 2661 
 2662         err = ext3_mark_inode_dirty(handle, inode);
 2663         handle->h_sync = 1;
 2664         ext3_journal_stop(handle, inode);
 2665         ext3_std_error(inode->i_sb, err);
 2666         
 2667         return err;
 2668 }
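
/*
 * Illustrative sketch, a hypothetical caller not in this file: in the real
 * tree the EXT3_IOC_SETFLAGS ioctl takes this path only when the
 * EXT3_JOURNAL_DATA_FL bit actually changes, since the flush-and-flip
 * above is far too heavyweight to run on every flag write:
 */
#if 0   /* example only */
static int example_set_journal_data(struct inode *inode, unsigned int flags)
{
        /* Only take the journal-flush path when the bit really flips. */
        if ((flags ^ inode->u.ext3_i.i_flags) & EXT3_JOURNAL_DATA_FL)
                return ext3_change_inode_journal_flag(inode,
                                        flags & EXT3_JOURNAL_DATA_FL);
        return 0;
}
#endif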
 2669 
 2670 
 2671 /*
 2672  * ext3_aops_journal_start().
 2673  *
 2674  * <This function died, but the comment lives on>
 2675  *
 2676  * We need to take the inode semaphore *outside* the
 2677  * journal_start/journal_stop.  Otherwise, a different task could do a
 2678  * wait_for_commit() while holding ->i_sem, which deadlocks.  The rule
 2679  * is: transaction open/closes are considered to be a locking operation
 2680  * and they nest *inside* ->i_sem.
 2681  * ----------------------------------------------------------------------------
 2682  * Possible problem:
 2683  *      ext3_file_write()
 2684  *      -> generic_file_write()
 2685  *         -> __alloc_pages()
 2686  *            -> page_launder()
 2687  *               -> ext3_writepage()
 2688  *
 2689  * And the writepage can be on a different fs while we have a
 2690  * transaction open against this one!  Bad.
 2691  *
 2692  * I tried making the task PF_MEMALLOC here, but that simply results in
 2693  * 0-order allocation failures passed back to generic_file_write().
 2694  * Instead, we rely on the reentrancy protection in ext3_writepage().
 2695  * ----------------------------------------------------------------------------
 2696  * When we do the journal_start() here we don't really need to reserve
 2697  * any blocks - we won't need any until we hit ext3_prepare_write(),
 2698  * which does all the needed journal extending.  However!  There is a
 2699  * problem with quotas:
 2700  *
 2701  * Thread 1:
 2702  * sys_sync
 2703  * ->sync_dquots
 2704  *   ->commit_dquot
 2705  *     ->lock_dquot
 2706  *     ->write_dquot
 2707  *       ->ext3_file_write
 2708  *         ->journal_start
 2709  *         ->ext3_prepare_write
 2710  *           ->journal_extend
 2711  *           ->journal_start
 2712  * Thread 2:
 2713  * ext3_create          (for example)
 2714  * ->ext3_new_inode
 2715  *   ->dquot_initialize
 2716  *     ->lock_dquot
 2717  *
 2718  * Deadlock.  Thread 1's journal_start blocks because thread 2 has a
 2719  * transaction open.  Thread 2's transaction will never close because
 2720  * thread 2 is stuck waiting for the dquot lock.
 2721  *
 2722  * So.  We must ensure that thread 1 *never* needs to extend the journal
 2723  * for quota writes.  We do that by reserving enough journal blocks
 2724  * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
 2725  * need to extend" test in ext3_prepare_write() succeeds.  
 2726  */
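
/*
 * Illustrative sketch, reconstructed from the comment above (the function
 * itself no longer exists): reserving the full worst-case credit count at
 * journal_start() time, so that the "see if we need to extend" test in
 * ext3_prepare_write() always succeeds and quota writes never block in
 * journal_extend():
 */
#if 0   /* example only */
static handle_t *example_aops_journal_start(struct inode *inode)
{
        /* ext3_writepage_trans_blocks() already folds in the
         * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS quota allowance. */
        return ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
}
#endif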
