FreeBSD/Linux Kernel Cross Reference
sys/fs/ext3/inode.c
1 /*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25 #include <linux/fs.h>
26 #include <linux/sched.h>
27 #include <linux/ext3_jbd.h>
28 #include <linux/jbd.h>
29 #include <linux/locks.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/quotaops.h>
33 #include <linux/module.h>
34
35 /*
36 * SEARCH_FROM_ZERO forces each block allocation to search from the start
37 * of the filesystem. This is to force rapid reallocation of recently-freed
38 * blocks. The file fragmentation is horrendous.
39 */
40 #undef SEARCH_FROM_ZERO
41
42 /* The ext3 forget function must perform a revoke if we are freeing data
43 * which has been journaled. Metadata (eg. indirect blocks) must be
44 * revoked in all cases.
45 *
46 * "bh" may be NULL: a metadata block may have been freed from memory
47 * but there may still be a record of it in the journal, and that record
48 * still needs to be revoked.
49 */
50
51 static int ext3_forget(handle_t *handle, int is_metadata,
52 struct inode *inode, struct buffer_head *bh,
53 int blocknr)
54 {
55 int err;
56
57 BUFFER_TRACE(bh, "enter");
58
59 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
60 "data mode %lx\n",
61 bh, is_metadata, inode->i_mode,
62 test_opt(inode->i_sb, DATA_FLAGS));
63
64 /* Never use the revoke function if we are doing full data
65 * journaling: there is no need to, and a V1 superblock won't
66 * support it. Otherwise, only skip the revoke on un-journaled
67 * data blocks. */
68
69 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
70 (!is_metadata && !ext3_should_journal_data(inode))) {
71 if (bh) {
72 BUFFER_TRACE(bh, "call journal_forget");
73 ext3_journal_forget(handle, bh);
74 }
75 return 0;
76 }
77
78 /*
79 * data!=journal && (is_metadata || should_journal_data(inode))
80 */
81 BUFFER_TRACE(bh, "call ext3_journal_revoke");
82 err = ext3_journal_revoke(handle, blocknr, bh);
83 if (err)
84 ext3_abort(inode->i_sb, __FUNCTION__,
85 "error %d when attempting revoke", err);
86 BUFFER_TRACE(bh, "exit");
87 return err;
88 }
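/*
 * To summarize the tests above: with a data=journal mount, or for a data
 * block belonging to an inode that does not journal its data, only
 * journal_forget() is used.  Otherwise (metadata, or journaled data on a
 * filesystem not mounted data=journal) the block must be revoked.
 */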
89
90 /*
91 * Work out how many blocks we need to progress with the next chunk of a
92 * truncate transaction.
93 */
94
95 static unsigned long blocks_for_truncate(struct inode *inode)
96 {
97 unsigned long needed;
98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100
101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext3 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */
107 if (needed < 2)
108 needed = 2;
109
110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */
112 if (needed > EXT3_MAX_TRANS_DATA)
113 needed = EXT3_MAX_TRANS_DATA;
114
115 return EXT3_DATA_TRANS_BLOCKS + needed;
116 }
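/*
 * For example, on a filesystem with 4KB blocks (s_blocksize_bits == 12),
 * i_blocks counts 512-byte sectors, so a 100-block file has i_blocks == 800
 * and needed == 800 >> 3 == 100, which is then bounded to at most
 * EXT3_MAX_TRANS_DATA.  A tiny (or corrupted) inode still reserves at least
 * 2, so the handle is always started with between EXT3_DATA_TRANS_BLOCKS + 2
 * and EXT3_DATA_TRANS_BLOCKS + EXT3_MAX_TRANS_DATA credits.
 */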
117
118 /*
119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal.
122 *
123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct
127 */
128
129 static handle_t *start_transaction(struct inode *inode)
130 {
131 handle_t *result;
132
133 result = ext3_journal_start(inode, blocks_for_truncate(inode));
134 if (!IS_ERR(result))
135 return result;
136
137 ext3_std_error(inode->i_sb, PTR_ERR(result));
138 return result;
139 }
140
141 /*
142 * Try to extend this transaction for the purposes of truncation.
143 *
144 * Returns 0 if we managed to create more room. If we can't create more
145 * room, we return 1 and the transaction must be restarted.
146 */
147 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
148 {
149 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
150 return 0;
151 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
152 return 0;
153 return 1;
154 }
155
156 /*
157 * Restart the transaction associated with *handle. This does a commit,
158 * so before we call here everything must be consistently dirtied against
159 * this transaction.
160 */
161 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
162 {
163 jbd_debug(2, "restarting handle %p\n", handle);
164 return ext3_journal_restart(handle, blocks_for_truncate(inode));
165 }
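/*
 * The two helpers above are used together in the truncate path (see
 * ext3_clear_blocks() and ext3_free_branches() below): when the handle
 * cannot be extended, everything modified so far is dirtied against it
 * and the handle is restarted to obtain fresh credits:
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		ext3_mark_inode_dirty(handle, inode);
 *		ext3_journal_test_restart(handle, inode);
 *	}
 */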
166
167 /*
168 * Called at each iput()
169 */
170 void ext3_put_inode (struct inode * inode)
171 {
172 ext3_discard_prealloc (inode);
173 }
174
175 /*
176 * Called at the last iput() if i_nlink is zero.
177 */
178 void ext3_delete_inode (struct inode * inode)
179 {
180 handle_t *handle;
181
182 if (is_bad_inode(inode) ||
183 inode->i_ino == EXT3_ACL_IDX_INO ||
184 inode->i_ino == EXT3_ACL_DATA_INO)
185 goto no_delete;
186
187 lock_kernel();
188 handle = start_transaction(inode);
189 if (IS_ERR(handle)) {
190 /* If we're going to skip the normal cleanup, we still
191 * need to make sure that the in-core orphan linked list
192 * is properly cleaned up. */
193 ext3_orphan_del(NULL, inode);
194
195 ext3_std_error(inode->i_sb, PTR_ERR(handle));
196 unlock_kernel();
197 goto no_delete;
198 }
199
200 if (IS_SYNC(inode))
201 handle->h_sync = 1;
202 inode->i_size = 0;
203 if (inode->i_blocks)
204 ext3_truncate(inode);
205 /*
206 * Kill off the orphan record which ext3_truncate created.
207 * AKPM: I think this can be inside the above `if'.
208 * Note that ext3_orphan_del() has to be able to cope with the
209 * deletion of a non-existent orphan - this is because we don't
210 * know if ext3_truncate() actually created an orphan record.
211 * (Well, we could do this if we need to, but heck - it works)
212 */
213 ext3_orphan_del(handle, inode);
214 inode->u.ext3_i.i_dtime = CURRENT_TIME;
215
216 /*
217 * One subtle ordering requirement: if anything has gone wrong
218 * (transaction abort, IO errors, whatever), then we can still
219 * do these next steps (the fs will already have been marked as
220 * having errors), but we can't free the inode if the mark_dirty
221 * fails.
222 */
223 if (ext3_mark_inode_dirty(handle, inode))
224 /* If that failed, just do the required in-core inode clear. */
225 clear_inode(inode);
226 else
227 ext3_free_inode(handle, inode);
228 ext3_journal_stop(handle, inode);
229 unlock_kernel();
230 return;
231 no_delete:
232 clear_inode(inode); /* We must guarantee clearing of inode... */
233 }
234
235 void ext3_discard_prealloc (struct inode * inode)
236 {
237 #ifdef EXT3_PREALLOCATE
238 lock_kernel();
239 /* Writer: ->i_prealloc* */
240 if (inode->u.ext3_i.i_prealloc_count) {
241 unsigned short total = inode->u.ext3_i.i_prealloc_count;
242 unsigned long block = inode->u.ext3_i.i_prealloc_block;
243 inode->u.ext3_i.i_prealloc_count = 0;
244 inode->u.ext3_i.i_prealloc_block = 0;
245 /* Writer: end */
246 ext3_free_blocks (inode, block, total);
247 }
248 unlock_kernel();
249 #endif
250 }
251
252 static int ext3_alloc_block (handle_t *handle,
253 struct inode * inode, unsigned long goal, int *err)
254 {
255 #ifdef EXT3FS_DEBUG
256 static unsigned long alloc_hits = 0, alloc_attempts = 0;
257 #endif
258 unsigned long result;
259
260 #ifdef EXT3_PREALLOCATE
261 /* Writer: ->i_prealloc* */
262 if (inode->u.ext3_i.i_prealloc_count &&
263 (goal == inode->u.ext3_i.i_prealloc_block ||
264 goal + 1 == inode->u.ext3_i.i_prealloc_block))
265 {
266 result = inode->u.ext3_i.i_prealloc_block++;
267 inode->u.ext3_i.i_prealloc_count--;
268 /* Writer: end */
269 ext3_debug ("preallocation hit (%lu/%lu).\n",
270 ++alloc_hits, ++alloc_attempts);
271 } else {
272 ext3_discard_prealloc (inode);
273 ext3_debug ("preallocation miss (%lu/%lu).\n",
274 alloc_hits, ++alloc_attempts);
275 if (S_ISREG(inode->i_mode))
276 result = ext3_new_block (inode, goal,
277 &inode->u.ext3_i.i_prealloc_count,
278 &inode->u.ext3_i.i_prealloc_block, err);
279 else
280 result = ext3_new_block (inode, goal, 0, 0, err);
281 /*
282 * AKPM: this is somewhat sticky. I'm not surprised it was
283 * disabled in 2.2's ext3. Need to integrate b_committed_data
284 * guarding with preallocation, if indeed preallocation is
285 * effective.
286 */
287 }
288 #else
289 result = ext3_new_block (handle, inode, goal, 0, 0, err);
290 #endif
291 return result;
292 }
293
294
295 typedef struct {
296 u32 *p;
297 u32 key;
298 struct buffer_head *bh;
299 } Indirect;
300
301 static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
302 {
303 p->key = *(p->p = v);
304 p->bh = bh;
305 }
306
307 static inline int verify_chain(Indirect *from, Indirect *to)
308 {
309 while (from <= to && from->key == *from->p)
310 from++;
311 return (from > to);
312 }
313
314 /**
315 * ext3_block_to_path - parse the block number into array of offsets
316 * @inode: inode in question (we are only interested in its superblock)
317 * @i_block: block number to be parsed
318 * @offsets: array to store the offsets in
319 *
320 * To store the locations of a file's data ext3 uses a data structure common
321 * to UNIX filesystems - a tree of pointers anchored in the inode, with
322 * data blocks at the leaves and indirect blocks in intermediate nodes.
323 * This function translates the block number into a path in that tree -
324 * the return value is the path length and @offsets[n] is the offset of the
325 * pointer to the (n+1)th node in the nth one. If @block is out of range
326 * (negative or too large), a warning is printed and zero is returned.
327 *
328 * Note: function doesn't find node addresses, so no IO is needed. All
329 * we need to know is the capacity of indirect blocks (taken from the
330 * inode->i_sb).
331 */
332
333 /*
334 * Portability note: the last comparison (check that we fit into triple
335 * indirect block) is spelled differently, because otherwise on an
336 * architecture with 32-bit longs and 8Kb pages we might get into trouble
337 * if our filesystem had 8Kb blocks. We might use long long, but that would
338 * kill us on x86. Oh, well, at least the sign propagation does not matter -
339 * i_block would have to be negative in the very beginning, so we would not
340 * get there at all.
341 */
342
343 static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
344 {
345 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
346 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
347 const long direct_blocks = EXT3_NDIR_BLOCKS,
348 indirect_blocks = ptrs,
349 double_blocks = (1 << (ptrs_bits * 2));
350 int n = 0;
351
352 if (i_block < 0) {
353 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
354 } else if (i_block < direct_blocks) {
355 offsets[n++] = i_block;
356 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
357 offsets[n++] = EXT3_IND_BLOCK;
358 offsets[n++] = i_block;
359 } else if ((i_block -= indirect_blocks) < double_blocks) {
360 offsets[n++] = EXT3_DIND_BLOCK;
361 offsets[n++] = i_block >> ptrs_bits;
362 offsets[n++] = i_block & (ptrs - 1);
363 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
364 offsets[n++] = EXT3_TIND_BLOCK;
365 offsets[n++] = i_block >> (ptrs_bits * 2);
366 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
367 offsets[n++] = i_block & (ptrs - 1);
368 } else {
369 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
370 }
371 return n;
372 }
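/*
 * Worked example, assuming 4KB blocks (1024 pointers per block, so
 * ptrs_bits == 10): blocks 0-11 are direct, 12-1035 go through the single
 * indirect block and 1036-1049611 through the double indirect one.
 * For i_block == 5000 we get 5000 - 12 - 1024 == 3964, so the path is
 * { EXT3_DIND_BLOCK, 3964 >> 10, 3964 & 1023 } == { EXT3_DIND_BLOCK, 3, 892 }
 * and the returned depth is 3.
 */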
373
374 /**
375 * ext3_get_branch - read the chain of indirect blocks leading to data
376 * @inode: inode in question
377 * @depth: depth of the chain (1 - direct pointer, etc.)
378 * @offsets: offsets of pointers in inode/indirect blocks
379 * @chain: place to store the result
380 * @err: here we store the error value
381 *
382 * Function fills the array of triples <key, p, bh> and returns %NULL
383 * if everything went OK or the pointer to the last filled triple
384 * (incomplete one) otherwise. Upon the return chain[i].key contains
385 * the number of (i+1)-th block in the chain (as it is stored in memory,
386 * i.e. little-endian 32-bit), chain[i].p contains the address of that
387 * number (it points into struct inode for i==0 and into the bh->b_data
388 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
389 * block for i>0 and NULL for i==0. In other words, it holds the block
390 * numbers of the chain, addresses they were taken from (and where we can
391 * verify that chain did not change) and buffer_heads hosting these
392 * numbers.
393 *
394 * Function stops when it stumbles upon zero pointer (absent block)
395 * (pointer to last triple returned, *@err == 0)
396 * or when it gets an IO error reading an indirect block
397 * (ditto, *@err == -EIO)
398 * or when it notices that chain had been changed while it was reading
399 * (ditto, *@err == -EAGAIN)
400 * or when it reads all @depth-1 indirect blocks successfully and finds
401 * the whole chain, all the way to the data (returns %NULL, *err == 0).
402 */
403 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
404 Indirect chain[4], int *err)
405 {
406 struct super_block *sb = inode->i_sb;
407 Indirect *p = chain;
408 struct buffer_head *bh;
409
410 *err = 0;
411 /* i_data is not going away, no lock needed */
412 add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
413 if (!p->key)
414 goto no_block;
415 while (--depth) {
416 bh = sb_bread(sb, le32_to_cpu(p->key));
417 if (!bh)
418 goto failure;
419 /* Reader: pointers */
420 if (!verify_chain(chain, p))
421 goto changed;
422 add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
423 /* Reader: end */
424 if (!p->key)
425 goto no_block;
426 }
427 return NULL;
428
429 changed:
430 brelse(bh);
431 *err = -EAGAIN;
432 goto no_block;
433 failure:
434 *err = -EIO;
435 no_block:
436 return p;
437 }
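/*
 * For instance, with depth == 2 (a block reached through the single
 * indirect block) a fully resolved chain looks like this:
 *
 *	chain[0].p   -> &inode->u.ext3_i.i_data[EXT3_IND_BLOCK]
 *	chain[0].key == indirect block number,  chain[0].bh == NULL
 *	chain[1].p   -> the slot inside that indirect block's bh->b_data
 *	chain[1].key == data block number,      chain[1].bh == indirect block's bh
 *
 * and ext3_get_branch() returns NULL with *err == 0.
 */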
438
439 /**
440 * ext3_find_near - find a place for allocation with sufficient locality
441 * @inode: owner
442 * @ind: descriptor of indirect block.
443 *
444 * This function returns the preferred place for block allocation.
445 * It is used when the heuristic for sequential allocation fails.
446 * Rules are:
447 * + if there is a block to the left of our position - allocate near it.
448 * + if pointer will live in indirect block - allocate near that block.
449 * + if pointer will live in inode - allocate in the same
450 * cylinder group.
451 * Caller must make sure that @ind is valid and will stay that way.
452 */
453
454 static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
455 {
456 u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
457 u32 *p;
458
459 /* Try to find previous block */
460 for (p = ind->p - 1; p >= start; p--)
461 if (*p)
462 return le32_to_cpu(*p);
463
464 /* No such thing, so let's try location of indirect block */
465 if (ind->bh)
466 return ind->bh->b_blocknr;
467
468 /*
469 * Is it going to be referred to from the inode itself? OK, just put it into
470 * the same cylinder group then.
471 */
472 return (inode->u.ext3_i.i_block_group *
473 EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
474 le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
475 }
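/*
 * Example: if the new pointer will live in i_data[EXT3_DIND_BLOCK] and
 * i_data[EXT3_IND_BLOCK] already holds block 5000, the goal is 5000.
 * If no earlier pointer is set but the new pointer lives inside an
 * indirect block, we aim at that indirect block's own block number.
 */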
476
477 /**
478 * ext3_find_goal - find a preferred place for allocation.
479 * @inode: owner
480 * @block: block we want
481 * @chain: chain of indirect blocks
482 * @partial: pointer to the last triple within a chain
483 * @goal: place to store the result.
484 *
485 * Normally this function finds the preferred place for block allocation,
486 * stores it in *@goal and returns zero. If the branch had been changed
487 * under us we return -EAGAIN.
488 */
489
490 static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
491 Indirect *partial, unsigned long *goal)
492 {
493 /* Writer: ->i_next_alloc* */
494 if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
495 inode->u.ext3_i.i_next_alloc_block++;
496 inode->u.ext3_i.i_next_alloc_goal++;
497 }
498 #ifdef SEARCH_FROM_ZERO
499 inode->u.ext3_i.i_next_alloc_block = 0;
500 inode->u.ext3_i.i_next_alloc_goal = 0;
501 #endif
502 /* Writer: end */
503 /* Reader: pointers, ->i_next_alloc* */
504 if (verify_chain(chain, partial)) {
505 /*
506 * try the heuristic for sequential allocation,
507 * failing that at least try to get decent locality.
508 */
509 if (block == inode->u.ext3_i.i_next_alloc_block)
510 *goal = inode->u.ext3_i.i_next_alloc_goal;
511 if (!*goal)
512 *goal = ext3_find_near(inode, partial);
513 #ifdef SEARCH_FROM_ZERO
514 *goal = 0;
515 #endif
516 return 0;
517 }
518 /* Reader: end */
519 return -EAGAIN;
520 }
521
522 /**
523 * ext3_alloc_branch - allocate and set up a chain of blocks.
524 * @inode: owner
525 * @num: depth of the chain (number of blocks to allocate)
526 * @offsets: offsets (in the blocks) to store the pointers to next.
527 * @branch: place to store the chain in.
528 *
529 * This function allocates @num blocks, zeroes out all but the last one,
530 * links them into chain and (if we are synchronous) writes them to disk.
531 * In other words, it prepares a branch that can be spliced onto the
532 * inode. It stores the information about that chain in the branch[], in
533 * the same format as ext3_get_branch() would do. We are calling it after
534 * we have read the existing part of the chain and partial points to the last
535 * triple of that (the one with zero ->key). Upon exit we have the same
536 * picture as after a successful ext3_get_block(), except that in one
537 * place chain is disconnected - *branch->p is still zero (we did not
538 * set the last link), but branch->key contains the number that should
539 * be placed into *branch->p to fill that gap.
540 *
541 * If allocation fails we free all blocks we've allocated (and forget
542 * their buffer_heads) and return the error value from the failed
543 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
544 * as described above and return 0.
545 */
546
547 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
548 int num,
549 unsigned long goal,
550 int *offsets,
551 Indirect *branch)
552 {
553 int blocksize = inode->i_sb->s_blocksize;
554 int n = 0, keys = 0;
555 int err = 0;
556 int i;
557 int parent = ext3_alloc_block(handle, inode, goal, &err);
558
559 branch[0].key = cpu_to_le32(parent);
560 if (parent) {
561 for (n = 1; n < num; n++) {
562 struct buffer_head *bh;
563 /* Allocate the next block */
564 int nr = ext3_alloc_block(handle, inode, parent, &err);
565 if (!nr)
566 break;
567 branch[n].key = cpu_to_le32(nr);
568 keys = n+1;
569
570 /*
571 * Get buffer_head for parent block, zero it out
572 * and set the pointer to new one, then send
573 * parent to disk.
574 */
575 bh = sb_getblk(inode->i_sb, parent);
576 branch[n].bh = bh;
577 lock_buffer(bh);
578 BUFFER_TRACE(bh, "call get_create_access");
579 err = ext3_journal_get_create_access(handle, bh);
580 if (err) {
581 unlock_buffer(bh);
582 brelse(bh);
583 break;
584 }
585
586 memset(bh->b_data, 0, blocksize);
587 branch[n].p = (u32*) bh->b_data + offsets[n];
588 *branch[n].p = branch[n].key;
589 BUFFER_TRACE(bh, "marking uptodate");
590 mark_buffer_uptodate(bh, 1);
591 unlock_buffer(bh);
592
593 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
594 err = ext3_journal_dirty_metadata(handle, bh);
595 if (err)
596 break;
597
598 parent = nr;
599 }
600 }
601 if (n == num)
602 return 0;
603
604 /* Allocation failed, free what we already allocated */
605 for (i = 1; i < keys; i++) {
606 BUFFER_TRACE(branch[i].bh, "call journal_forget");
607 ext3_journal_forget(handle, branch[i].bh);
608 }
609 for (i = 0; i < keys; i++)
610 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
611 return err;
612 }
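/*
 * For example, if ext3_get_branch() stopped with both the indirect block
 * and the data block missing (num == 2), a successful call leaves:
 *
 *	branch[0].key == number of the new indirect block (*branch[0].p is
 *			 still zero - that is the missing link)
 *	branch[1].key == number of the new data block, already written at
 *			 branch[1].p inside the zeroed-out indirect block
 *
 * ext3_splice_branch() below then fills in *branch[0].p to make the whole
 * chain visible.
 */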
613
614 /**
615 * ext3_splice_branch - splice the allocated branch onto inode.
616 * @inode: owner
617 * @block: (logical) number of block we are adding
618 * @chain: chain of indirect blocks (with a missing link - see
619 * ext3_alloc_branch)
620 * @where: location of missing link
621 * @num: number of blocks we are adding
622 *
623 * This function verifies that chain (up to the missing link) had not
624 * changed, fills the missing link and does all housekeeping needed in
625 * inode (->i_blocks, etc.). In case of success we end up with the full
626 * chain to new block and return 0. Otherwise (== chain had been changed)
627 * we free the new blocks (forgetting their buffer_heads, indeed) and
628 * return -EAGAIN.
629 */
630
631 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
632 Indirect chain[4], Indirect *where, int num)
633 {
634 int i;
635 int err = 0;
636
637 /*
638 * If we're splicing into a [td]indirect block (as opposed to the
639 * inode) then we need to get write access to the [td]indirect block
640 * before the splice.
641 */
642 if (where->bh) {
643 BUFFER_TRACE(where->bh, "get_write_access");
644 err = ext3_journal_get_write_access(handle, where->bh);
645 if (err)
646 goto err_out;
647 }
648 /* Verify that place we are splicing to is still there and vacant */
649
650 /* Writer: pointers, ->i_next_alloc* */
651 if (!verify_chain(chain, where-1) || *where->p)
652 /* Writer: end */
653 goto changed;
654
655 /* That's it */
656
657 *where->p = where->key;
658 inode->u.ext3_i.i_next_alloc_block = block;
659 inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
660 #ifdef SEARCH_FROM_ZERO
661 inode->u.ext3_i.i_next_alloc_block = 0;
662 inode->u.ext3_i.i_next_alloc_goal = 0;
663 #endif
664 /* Writer: end */
665
666 /* We are done with atomic stuff, now do the rest of housekeeping */
667
668 inode->i_ctime = CURRENT_TIME;
669 ext3_mark_inode_dirty(handle, inode);
670
671 /* had we spliced it onto indirect block? */
672 if (where->bh) {
673 /*
674 * akpm: If we spliced it onto an indirect block, we haven't
675 * altered the inode. Note however that if it is being spliced
676 * onto an indirect block at the very end of the file (the
677 * file is growing) then we *will* alter the inode to reflect
678 * the new i_size. But that is not done here - it is done in
679 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
680 */
681 jbd_debug(5, "splicing indirect only\n");
682 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
683 err = ext3_journal_dirty_metadata(handle, where->bh);
684 if (err)
685 goto err_out;
686 } else {
687 /*
688 * OK, we spliced it into the inode itself on a direct block.
689 * Inode was dirtied above.
690 */
691 jbd_debug(5, "splicing direct\n");
692 }
693 return err;
694
695 changed:
696 /*
697 * AKPM: if where[i].bh isn't part of the current updating
698 * transaction then we explode nastily. Test this code path.
699 */
700 jbd_debug(1, "the chain changed: try again\n");
701 err = -EAGAIN;
702
703 err_out:
704 for (i = 1; i < num; i++) {
705 BUFFER_TRACE(where[i].bh, "call journal_forget");
706 ext3_journal_forget(handle, where[i].bh);
707 }
708 /* For the normal collision cleanup case, we free up the blocks.
709 * On genuine filesystem errors we don't even think about doing
710 * that. */
711 if (err == -EAGAIN)
712 for (i = 0; i < num; i++)
713 ext3_free_blocks(handle, inode,
714 le32_to_cpu(where[i].key), 1);
715 return err;
716 }
717
718 /*
719 * Allocation strategy is simple: if we have to allocate something, we will
720 * have to go the whole way to leaf. So let's do it before attaching anything
721 * to tree, set linkage between the newborn blocks, write them if sync is
722 * required, recheck the path, free and repeat if check fails, otherwise
723 * set the last missing link (that will protect us from any truncate-generated
724 * removals - all blocks on the path are immune now) and possibly force the
725 * write on the parent block.
726 * That has a nice additional property: no special recovery from the failed
727 * allocations is needed - we simply release blocks and do not touch anything
728 * reachable from inode.
729 *
730 * akpm: `handle' can be NULL if create == 0.
731 *
732 * The BKL may not be held on entry here. Be sure to take it early.
733 */
734
735 static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
736 long iblock,
737 struct buffer_head *bh_result, int create)
738 {
739 int err = -EIO;
740 int offsets[4];
741 Indirect chain[4];
742 Indirect *partial;
743 unsigned long goal;
744 int left;
745 int depth = ext3_block_to_path(inode, iblock, offsets);
746 loff_t new_size;
747
748 J_ASSERT(handle != NULL || create == 0);
749
750 if (depth == 0)
751 goto out;
752
753 lock_kernel();
754 reread:
755 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
756
757 /* Simplest case - block found, no allocation needed */
758 if (!partial) {
759 bh_result->b_state &= ~(1UL << BH_New);
760 got_it:
761 bh_result->b_dev = inode->i_dev;
762 bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
763 bh_result->b_state |= (1UL << BH_Mapped);
764 /* Clean up and exit */
765 partial = chain+depth-1; /* the whole chain */
766 goto cleanup;
767 }
768
769 /* Next simple case - plain lookup or failed read of indirect block */
770 if (!create || err == -EIO) {
771 cleanup:
772 while (partial > chain) {
773 BUFFER_TRACE(partial->bh, "call brelse");
774 brelse(partial->bh);
775 partial--;
776 }
777 BUFFER_TRACE(bh_result, "returned");
778 unlock_kernel();
779 out:
780 return err;
781 }
782
783 /*
784 * Indirect block might be removed by truncate while we were
785 * reading it. Handling of that case (forget what we've got and
786 * reread) is taken out of the main path.
787 */
788 if (err == -EAGAIN)
789 goto changed;
790
791 if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
792 goto changed;
793
794 left = (chain + depth) - partial;
795
796 /*
797 * Block out ext3_truncate while we alter the tree
798 */
799 down_read(&inode->u.ext3_i.truncate_sem);
800 err = ext3_alloc_branch(handle, inode, left, goal,
801 offsets+(partial-chain), partial);
802
803 /* The ext3_splice_branch call will free and forget any buffers
804 * on the new chain if there is a failure, but that risks using
805 * up transaction credits, especially for bitmaps where the
806 * credits cannot be returned. Can we handle this somehow? We
807 * may need to return -EAGAIN upwards in the worst case. --sct */
808 if (!err)
809 err = ext3_splice_branch(handle, inode, iblock, chain,
810 partial, left);
811 up_read(&inode->u.ext3_i.truncate_sem);
812 if (err == -EAGAIN)
813 goto changed;
814 if (err)
815 goto cleanup;
816
817 new_size = inode->i_size;
818 /*
819 * This is not racy against ext3_truncate's modification of i_disksize
820 * because VM/VFS ensures that the file cannot be extended while
821 * truncate is in progress. It is racy between multiple parallel
822 * instances of get_block, but we have the BKL.
823 */
824 if (new_size > inode->u.ext3_i.i_disksize)
825 inode->u.ext3_i.i_disksize = new_size;
826
827 bh_result->b_state |= (1UL << BH_New);
828 goto got_it;
829
830 changed:
831 while (partial > chain) {
832 jbd_debug(1, "buffer chain changed, retrying\n");
833 BUFFER_TRACE(partial->bh, "brelsing");
834 brelse(partial->bh);
835 partial--;
836 }
837 goto reread;
838 }
839
840 /*
841 * The BKL is not held on entry here.
842 */
843 static int ext3_get_block(struct inode *inode, long iblock,
844 struct buffer_head *bh_result, int create)
845 {
846 handle_t *handle = 0;
847 int ret;
848
849 if (create) {
850 handle = ext3_journal_current_handle();
851 J_ASSERT(handle != 0);
852 }
853 ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
854 return ret;
855 }
856
857 /*
858 * `handle' can be NULL if create is zero
859 */
860 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
861 long block, int create, int * errp)
862 {
863 struct buffer_head dummy;
864 int fatal = 0, err;
865
866 J_ASSERT(handle != NULL || create == 0);
867
868 dummy.b_state = 0;
869 dummy.b_blocknr = -1000;
870 buffer_trace_init(&dummy.b_history);
871 *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
872 if (!*errp && buffer_mapped(&dummy)) {
873 struct buffer_head *bh;
874 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
875 if (buffer_new(&dummy)) {
876 J_ASSERT(create != 0);
877 J_ASSERT(handle != 0);
878
879 /* Now that we do not always journal data, we
880 should keep in mind whether this should
881 always journal the new buffer as metadata.
882 For now, regular file writes use
883 ext3_get_block instead, so it's not a
884 problem. */
885 lock_kernel();
886 lock_buffer(bh);
887 BUFFER_TRACE(bh, "call get_create_access");
888 fatal = ext3_journal_get_create_access(handle, bh);
889 if (!fatal) {
890 memset(bh->b_data, 0,
891 inode->i_sb->s_blocksize);
892 mark_buffer_uptodate(bh, 1);
893 }
894 unlock_buffer(bh);
895 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
896 err = ext3_journal_dirty_metadata(handle, bh);
897 if (!fatal) fatal = err;
898 unlock_kernel();
899 } else {
900 BUFFER_TRACE(bh, "not a new buffer");
901 }
902 if (fatal) {
903 *errp = fatal;
904 brelse(bh);
905 bh = NULL;
906 }
907 return bh;
908 }
909 return NULL;
910 }
911
912 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
913 int block, int create, int *err)
914 {
915 struct buffer_head * bh;
916 int prev_blocks;
917
918 prev_blocks = inode->i_blocks;
919
920 bh = ext3_getblk (handle, inode, block, create, err);
921 if (!bh)
922 return bh;
923 #ifdef EXT3_PREALLOCATE
924 /*
925 * If the inode has grown, and this is a directory, then use a few
926 * more of the preallocated blocks to keep directory fragmentation
927 * down. The preallocated blocks are guaranteed to be contiguous.
928 */
929 if (create &&
930 S_ISDIR(inode->i_mode) &&
931 inode->i_blocks > prev_blocks &&
932 EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
933 EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
934 int i;
935 struct buffer_head *tmp_bh;
936
937 for (i = 1;
938 inode->u.ext3_i.i_prealloc_count &&
939 i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
940 i++) {
941 /*
942 * ext3_getblk will zero out the contents of the
943 * directory for us
944 */
945 tmp_bh = ext3_getblk(handle, inode,
946 block+i, create, err);
947 if (!tmp_bh) {
948 brelse (bh);
949 return 0;
950 }
951 brelse (tmp_bh);
952 }
953 }
954 #endif
955 if (buffer_uptodate(bh))
956 return bh;
957 ll_rw_block (READ, 1, &bh);
958 wait_on_buffer (bh);
959 if (buffer_uptodate(bh))
960 return bh;
961 brelse (bh);
962 *err = -EIO;
963 return NULL;
964 }
965
966 static int walk_page_buffers( handle_t *handle,
967 struct inode *inode,
968 struct buffer_head *head,
969 unsigned from,
970 unsigned to,
971 int *partial,
972 int (*fn)( handle_t *handle,
973 struct inode *inode,
974 struct buffer_head *bh))
975 {
976 struct buffer_head *bh;
977 unsigned block_start, block_end;
978 unsigned blocksize = head->b_size;
979 int err, ret = 0;
980
981 for ( bh = head, block_start = 0;
982 ret == 0 && (bh != head || !block_start);
983 block_start = block_end, bh = bh->b_this_page)
984 {
985 block_end = block_start + blocksize;
986 if (block_end <= from || block_start >= to) {
987 if (partial && !buffer_uptodate(bh))
988 *partial = 1;
989 continue;
990 }
991 err = (*fn)(handle, inode, bh);
992 if (!ret)
993 ret = err;
994 }
995 return ret;
996 }
997
998 /*
999 * To preserve ordering, it is essential that the hole instantiation and
1000 * the data write be encapsulated in a single transaction. We cannot
1001 * close off a transaction and start a new one between the ext3_get_block()
1002 * and the commit_write(). So doing the journal_start at the start of
1003 * prepare_write() is the right place.
1004 *
1005 * Also, this function can nest inside ext3_writepage() ->
1006 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1007 * has generated enough buffer credits to do the whole page. So we won't
1008 * block on the journal in that case, which is good, because the caller may
1009 * be PF_MEMALLOC.
1010 *
1011 * By accident, ext3 can be reentered when a transaction is open via
1012 * quota file writes. If we were to commit the transaction while thus
1013 * reentered, there can be a deadlock - we would be holding a quota
1014 * lock, and the commit would never complete if another thread had a
1015 * transaction open and was blocking on the quota lock - a ranking
1016 * violation.
1017 *
1018 * So what we do is to rely on the fact that journal_stop/journal_start
1019 * will _not_ run commit under these circumstances because handle->h_ref
1020 * is elevated. We'll still have enough credits for the tiny quotafile
1021 * write.
1022 */
1023
1024 static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
1025 struct buffer_head *bh)
1026 {
1027 return ext3_journal_get_write_access(handle, bh);
1028 }
1029
1030 static int ext3_prepare_write(struct file *file, struct page *page,
1031 unsigned from, unsigned to)
1032 {
1033 struct inode *inode = page->mapping->host;
1034 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1035 handle_t *handle;
1036
1037 lock_kernel();
1038 handle = ext3_journal_start(inode, needed_blocks);
1039 if (IS_ERR(handle)) {
1040 ret = PTR_ERR(handle);
1041 goto out;
1042 }
1043 unlock_kernel();
1044 ret = block_prepare_write(page, from, to, ext3_get_block);
1045 lock_kernel();
1046 if (ret != 0)
1047 goto prepare_write_failed;
1048
1049 if (ext3_should_journal_data(inode)) {
1050 ret = walk_page_buffers(handle, inode, page->buffers,
1051 from, to, NULL, do_journal_get_write_access);
1052 if (ret) {
1053 /*
1054 * We're going to fail this prepare_write(),
1055 * so commit_write() will not be called.
1056 * We need to undo block_prepare_write()'s kmap().
1057 * AKPM: Do we need to clear PageUptodate? I don't
1058 * think so.
1059 */
1060 kunmap(page);
1061 }
1062 }
1063 prepare_write_failed:
1064 if (ret)
1065 ext3_journal_stop(handle, inode);
1066 out:
1067 unlock_kernel();
1068 return ret;
1069 }
1070
1071 static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
1072 struct buffer_head *bh)
1073 {
1074 int ret = ext3_journal_dirty_data(handle, bh, 0);
1075 buffer_insert_inode_data_queue(bh, inode);
1076 return ret;
1077 }
1078
1079 /*
1080 * For ext3_writepage(). We also brelse() the buffer to account for
1081 * the bget() which ext3_writepage() performs.
1082 */
1083 static int journal_dirty_async_data(handle_t *handle, struct inode *inode,
1084 struct buffer_head *bh)
1085 {
1086 int ret = ext3_journal_dirty_data(handle, bh, 1);
1087 buffer_insert_inode_data_queue(bh, inode);
1088 __brelse(bh);
1089 return ret;
1090 }
1091
1092 /* For commit_write() in data=journal mode */
1093 static int commit_write_fn(handle_t *handle, struct inode *inode,
1094 struct buffer_head *bh)
1095 {
1096 set_bit(BH_Uptodate, &bh->b_state);
1097 return ext3_journal_dirty_metadata(handle, bh);
1098 }
1099
1100 /*
1101 * We need to pick up the new inode size which generic_commit_write gave us.
1102 * `file' can be NULL - e.g., when called from block_symlink().
1103 *
1104 * ext3 inode->i_dirty_buffers policy: If we're journalling data we
1105 * definitely don't want them to appear on the inode at all - instead
1106 * we need to manage them at the JBD layer and we need to intercept
1107 * the relevant sync operations and translate them into journal operations.
1108 *
1109 * If we're not journalling data then we can just leave the buffers
1110 * on ->i_dirty_buffers. If someone writes them out for us then thanks.
1111 * Otherwise we'll do it in commit, if we're using ordered data.
1112 */
1113
1114 static int ext3_commit_write(struct file *file, struct page *page,
1115 unsigned from, unsigned to)
1116 {
1117 handle_t *handle = ext3_journal_current_handle();
1118 struct inode *inode = page->mapping->host;
1119 int ret = 0, ret2;
1120
1121 lock_kernel();
1122 if (ext3_should_journal_data(inode)) {
1123 /*
1124 * Here we duplicate the generic_commit_write() functionality
1125 */
1126 int partial = 0;
1127 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1128
1129 ret = walk_page_buffers(handle, inode, page->buffers,
1130 from, to, &partial, commit_write_fn);
1131 if (!partial)
1132 SetPageUptodate(page);
1133 kunmap(page);
1134 if (pos > inode->i_size)
1135 inode->i_size = pos;
1136 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1137 } else {
1138 if (ext3_should_order_data(inode)) {
1139 ret = walk_page_buffers(handle, inode, page->buffers,
1140 from, to, NULL, journal_dirty_sync_data);
1141 }
1142 /* Be careful here if generic_commit_write becomes a
1143 * required invocation after block_prepare_write. */
1144 if (ret == 0) {
1145 ret = generic_commit_write(file, page, from, to);
1146 } else {
1147 /*
1148 * block_prepare_write() was called, but we're not
1149 * going to call generic_commit_write(). So we
1150 * need to perform generic_commit_write()'s kunmap
1151 * by hand.
1152 */
1153 kunmap(page);
1154 }
1155 }
1156 if (inode->i_size > inode->u.ext3_i.i_disksize) {
1157 inode->u.ext3_i.i_disksize = inode->i_size;
1158 ret2 = ext3_mark_inode_dirty(handle, inode);
1159 if (!ret)
1160 ret = ret2;
1161 }
1162 ret2 = ext3_journal_stop(handle, inode);
1163 unlock_kernel();
1164 if (!ret)
1165 ret = ret2;
1166 return ret;
1167 }
1168
1169 /*
1170 * bmap() is special. It gets used by applications such as lilo and by
1171 * the swapper to find the on-disk block of a specific piece of data.
1172 *
1173 * Naturally, this is dangerous if the block concerned is still in the
1174 * journal. If somebody makes a swapfile on an ext3 data-journaling
1175 * filesystem and enables swap, then they may get a nasty shock when the
1176 * data getting swapped to that swapfile suddenly gets overwritten by
1177 * the original zeros written out previously to the journal and
1178 * awaiting writeback in the kernel's buffer cache.
1179 *
1180 * So, if we see any bmap calls here on a modified, data-journaled file,
1181 * take extra steps to flush any blocks which might be in the cache.
1182 */
1183 static int ext3_bmap(struct address_space *mapping, long block)
1184 {
1185 struct inode *inode = mapping->host;
1186 journal_t *journal;
1187 int err;
1188
1189 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1190 /*
1191 * This is a REALLY heavyweight approach, but the use of
1192 * bmap on dirty files is expected to be extremely rare:
1193 * only if we run lilo or swapon on a freshly made file
1194 * do we expect this to happen.
1195 *
1196 * (bmap requires CAP_SYS_RAWIO so this does not
1197 * represent an unprivileged user DOS attack --- we'd be
1198 * in trouble if mortal users could trigger this path at
1199 * will.)
1200 *
1201 * NB. EXT3_STATE_JDATA is not set on files other than
1202 * regular files. If somebody wants to bmap a directory
1203 * or symlink and gets confused because the buffer
1204 * hasn't yet been flushed to disk, they deserve
1205 * everything they get.
1206 */
1207
1208 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1209 journal = EXT3_JOURNAL(inode);
1210 journal_lock_updates(journal);
1211 err = journal_flush(journal);
1212 journal_unlock_updates(journal);
1213
1214 if (err)
1215 return 0;
1216 }
1217
1218 return generic_block_bmap(mapping,block,ext3_get_block);
1219 }
1220
1221 static int bget_one(handle_t *handle, struct inode *inode,
1222 struct buffer_head *bh)
1223 {
1224 atomic_inc(&bh->b_count);
1225 return 0;
1226 }
1227
1228 /*
1229 * Note that we always start a transaction even if we're not journalling
1230 * data. This is to preserve ordering: any hole instantiation within
1231 * __block_write_full_page -> ext3_get_block() should be journalled
1232 * along with the data so we don't crash and then get metadata which
1233 * refers to old data.
1234 *
1235 * In all journalling modes block_write_full_page() will start the I/O.
1236 *
1237 * Problem:
1238 *
1239 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1240 * ext3_writepage()
1241 *
1242 * Similar for:
1243 *
1244 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1245 *
1246 * Same applies to ext3_get_block(). We will deadlock on various things like
1247 * lock_journal and i_truncate_sem.
1248 *
1249 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1250 * allocations fail.
1251 *
1252 * 16May01: If we're reentered then journal_current_handle() will be
1253 * non-zero. We simply *return*.
1254 *
1255 * 1 July 2001: @@@ FIXME:
1256 * In journalled data mode, a data buffer may be metadata against the
1257 * current transaction. But the same file is part of a shared mapping
1258 * and someone does a writepage() on it.
1259 *
1260 * We will move the buffer onto the async_data list, but *after* it has
1261 * been dirtied. So there's a small window where we have dirty data on
1262 * BJ_Metadata.
1263 *
1264 * Note that this only applies to the last partial page in the file. The
1265 * bit which block_write_full_page() uses prepare/commit for. (That's
1266 * broken code anyway: it's wrong for msync()).
1267 *
1268 * It's a rare case: affects the final partial page, for journalled data
1269 * where the file is subject to both write() and writepage() in the same
1270 * transaction. To fix it we'll need a custom block_write_full_page().
1271 * We'll probably need that anyway for journalling writepage() output.
1272 *
1273 * We don't honour synchronous mounts for writepage(). That would be
1274 * disastrous. Any write() or metadata operation will sync the fs for
1275 * us.
1276 */
1277 static int ext3_writepage(struct page *page)
1278 {
1279 struct inode *inode = page->mapping->host;
1280 struct buffer_head *page_buffers;
1281 handle_t *handle = NULL;
1282 int ret = 0, err;
1283 int needed;
1284 int order_data;
1285
1286 J_ASSERT(PageLocked(page));
1287
1288 /*
1289 * We give up here if we're reentered, because it might be
1290 * for a different filesystem. One *could* look for a
1291 * nested transaction opportunity.
1292 */
1293 lock_kernel();
1294 if (ext3_journal_current_handle())
1295 goto out_fail;
1296
1297 needed = ext3_writepage_trans_blocks(inode);
1298 if (current->flags & PF_MEMALLOC)
1299 handle = ext3_journal_try_start(inode, needed);
1300 else
1301 handle = ext3_journal_start(inode, needed);
1302
1303 if (IS_ERR(handle)) {
1304 ret = PTR_ERR(handle);
1305 goto out_fail;
1306 }
1307
1308 order_data = ext3_should_order_data(inode) ||
1309 ext3_should_journal_data(inode);
1310
1311 unlock_kernel();
1312
1313 page_buffers = NULL; /* Purely to prevent compiler warning */
1314
1315 /* bget() all the buffers */
1316 if (order_data) {
1317 if (!page->buffers)
1318 create_empty_buffers(page,
1319 inode->i_dev, inode->i_sb->s_blocksize);
1320 page_buffers = page->buffers;
1321 walk_page_buffers(handle, inode, page_buffers, 0,
1322 PAGE_CACHE_SIZE, NULL, bget_one);
1323 }
1324
1325 ret = block_write_full_page(page, ext3_get_block);
1326
1327 /*
1328 * The page can become unlocked at any point now, and
1329 * truncate can then come in and change things. So we
1330 * can't touch *page from now on. But *page_buffers is
1331 * safe due to elevated refcount.
1332 */
1333
1334 handle = ext3_journal_current_handle();
1335 lock_kernel();
1336
1337 /* And attach them to the current transaction */
1338 if (order_data) {
1339 err = walk_page_buffers(handle, inode, page_buffers,
1340 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
1341 if (!ret)
1342 ret = err;
1343 }
1344
1345 err = ext3_journal_stop(handle, inode);
1346 if (!ret)
1347 ret = err;
1348 unlock_kernel();
1349 return ret;
1350
1351 out_fail:
1352
1353 unlock_kernel();
1354 SetPageDirty(page);
1355 UnlockPage(page);
1356 return ret;
1357 }
1358
1359 static int ext3_readpage(struct file *file, struct page *page)
1360 {
1361 return block_read_full_page(page,ext3_get_block);
1362 }
1363
1364
1365 static int ext3_flushpage(struct page *page, unsigned long offset)
1366 {
1367 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1368 return journal_flushpage(journal, page, offset);
1369 }
1370
1371 static int ext3_releasepage(struct page *page, int wait)
1372 {
1373 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1374 return journal_try_to_free_buffers(journal, page, wait);
1375 }
1376
1377
1378 struct address_space_operations ext3_aops = {
1379 readpage: ext3_readpage, /* BKL not held. Don't need */
1380 writepage: ext3_writepage, /* BKL not held. We take it */
1381 sync_page: block_sync_page,
1382 prepare_write: ext3_prepare_write, /* BKL not held. We take it */
1383 commit_write: ext3_commit_write, /* BKL not held. We take it */
1384 bmap: ext3_bmap, /* BKL held */
1385 flushpage: ext3_flushpage, /* BKL not held. Don't need */
1386 releasepage: ext3_releasepage, /* BKL not held. Don't need */
1387 };
1388
1389 /*
1390 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1391 * up to the end of the block which corresponds to `from'.
1392 * This is required during truncate. We need to physically zero the tail end
1393 * of that block so it doesn't yield old data if the file is later grown.
1394 */
1395 static int ext3_block_truncate_page(handle_t *handle,
1396 struct address_space *mapping, loff_t from)
1397 {
1398 unsigned long index = from >> PAGE_CACHE_SHIFT;
1399 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1400 unsigned blocksize, iblock, length, pos;
1401 struct inode *inode = mapping->host;
1402 struct page *page;
1403 struct buffer_head *bh;
1404 int err;
1405
1406 blocksize = inode->i_sb->s_blocksize;
1407 length = offset & (blocksize - 1);
1408
1409 /* Block boundary? Nothing to do */
1410 if (!length)
1411 return 0;
1412
1413 length = blocksize - length;
1414 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1415
1416 page = find_or_create_page(mapping, index, GFP_NOFS);
1417 err = -ENOMEM;
1418 if (!page)
1419 goto out;
1420
1421 if (!page->buffers)
1422 create_empty_buffers(page, inode->i_dev, blocksize);
1423
1424 /* Find the buffer that contains "offset" */
1425 bh = page->buffers;
1426 pos = blocksize;
1427 while (offset >= pos) {
1428 bh = bh->b_this_page;
1429 iblock++;
1430 pos += blocksize;
1431 }
1432
1433 err = 0;
1434 if (!buffer_mapped(bh)) {
1435 /* Hole? Nothing to do */
1436 if (buffer_uptodate(bh))
1437 goto unlock;
1438 ext3_get_block(inode, iblock, bh, 0);
1439 /* Still unmapped? Nothing to do */
1440 if (!buffer_mapped(bh))
1441 goto unlock;
1442 }
1443
1444 /* Ok, it's mapped. Make sure it's up-to-date */
1445 if (Page_Uptodate(page))
1446 set_bit(BH_Uptodate, &bh->b_state);
1447
1448 if (!buffer_uptodate(bh)) {
1449 err = -EIO;
1450 ll_rw_block(READ, 1, &bh);
1451 wait_on_buffer(bh);
1452 /* Uhhuh. Read error. Complain and punt. */
1453 if (!buffer_uptodate(bh))
1454 goto unlock;
1455 }
1456
1457 if (ext3_should_journal_data(inode)) {
1458 BUFFER_TRACE(bh, "get write access");
1459 err = ext3_journal_get_write_access(handle, bh);
1460 if (err)
1461 goto unlock;
1462 }
1463
1464 memset(kmap(page) + offset, 0, length);
1465 flush_dcache_page(page);
1466 kunmap(page);
1467
1468 BUFFER_TRACE(bh, "zeroed end of block");
1469
1470 err = 0;
1471 if (ext3_should_journal_data(inode)) {
1472 err = ext3_journal_dirty_metadata(handle, bh);
1473 } else {
1474 if (ext3_should_order_data(inode))
1475 err = ext3_journal_dirty_data(handle, bh, 0);
1476 __mark_buffer_dirty(bh);
1477 }
1478
1479 unlock:
1480 UnlockPage(page);
1481 page_cache_release(page);
1482 out:
1483 return err;
1484 }
1485
1486 /*
1487 * Probably it should be a library function... search for first non-zero word
1488 * or memcmp with zero_page, whatever is better for particular architecture.
1489 * Linus?
1490 */
1491 static inline int all_zeroes(u32 *p, u32 *q)
1492 {
1493 while (p < q)
1494 if (*p++)
1495 return 0;
1496 return 1;
1497 }
1498
1499 /**
1500 * ext3_find_shared - find the indirect blocks for partial truncation.
1501 * @inode: inode in question
1502 * @depth: depth of the affected branch
1503 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1504 * @chain: place to store the pointers to partial indirect blocks
1505 * @top: place to the (detached) top of branch
1506 *
1507 * This is a helper function used by ext3_truncate().
1508 *
1509 * When we do truncate() we may have to clean the ends of several
1510 * indirect blocks but leave the blocks themselves alive. Block is
1511 * partially truncated if some data below the new i_size is referred to
1512 * from it (and it is on the path to the first completely truncated
1513 * data block, indeed). We have to free the top of that path along
1514 * with everything to the right of the path. Since no allocation
1515 * past the truncation point is possible until ext3_truncate()
1516 * finishes, we may safely do the latter, but top of branch may
1517 * require special attention - pageout below the truncation point
1518 * might try to populate it.
1519 *
1520 * We atomically detach the top of branch from the tree, store the
1521 * block number of its root in *@top, pointers to buffer_heads of
1522 * partially truncated blocks - in @chain[].bh and pointers to
1523 * their last elements that should not be removed - in
1524 * @chain[].p. Return value is the pointer to last filled element
1525 * of @chain.
1526 *
1527 * The work left to the caller is to do the actual freeing of subtrees:
1528 * a) free the subtree starting from *@top
1529 * b) free the subtrees whose roots are stored in
1530 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1531 * c) free the subtrees growing from the inode past the @chain[0].
1532 * (no partially truncated stuff there). */
1533
1534 static Indirect *ext3_find_shared(struct inode *inode,
1535 int depth,
1536 int offsets[4],
1537 Indirect chain[4],
1538 u32 *top)
1539 {
1540 Indirect *partial, *p;
1541 int k, err;
1542
1543 *top = 0;
1544 /* Make k index the deepest non-null offset + 1 */
1545 for (k = depth; k > 1 && !offsets[k-1]; k--)
1546 ;
1547 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1548 /* Writer: pointers */
1549 if (!partial)
1550 partial = chain + k-1;
1551 /*
1552 * If the branch acquired continuation since we've looked at it -
1553 * fine, it should all survive and (new) top doesn't belong to us.
1554 */
1555 if (!partial->key && *partial->p)
1556 /* Writer: end */
1557 goto no_top;
1558 for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1559 ;
1560 /*
1561 * OK, we've found the last block that must survive. The rest of our
1562 * branch should be detached before unlocking. However, if that rest
1563 * of branch is all ours and does not grow immediately from the inode
1564 * it's easier to cheat and just decrement partial->p.
1565 */
1566 if (p == chain + k - 1 && p > chain) {
1567 p->p--;
1568 } else {
1569 *top = *p->p;
1570 /* Nope, don't do this in ext3. Must leave the tree intact */
1571 #if 0
1572 *p->p = 0;
1573 #endif
1574 }
1575 /* Writer: end */
1576
1577 while(partial > p)
1578 {
1579 brelse(partial->bh);
1580 partial--;
1581 }
1582 no_top:
1583 return partial;
1584 }
1585
1586 /*
1587 * Zero a number of block pointers in either an inode or an indirect block.
1588 * If we restart the transaction we must again get write access to the
1589 * indirect block for further modification.
1590 *
1591 * We release `count' blocks on disk, but (last - first) may be greater
1592 * than `count' because there can be holes in there.
1593 */
1594 static void
1595 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1596 unsigned long block_to_free, unsigned long count,
1597 u32 *first, u32 *last)
1598 {
1599 u32 *p;
1600 if (try_to_extend_transaction(handle, inode)) {
1601 if (bh) {
1602 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1603 ext3_journal_dirty_metadata(handle, bh);
1604 }
1605 ext3_mark_inode_dirty(handle, inode);
1606 ext3_journal_test_restart(handle, inode);
1607 if (bh) {
1608 BUFFER_TRACE(bh, "retaking write access");
1609 ext3_journal_get_write_access(handle, bh);
1610 }
1611 }
1612
1613 /*
1614 * Any buffers which are on the journal will be in memory. We find
1615 * them on the hash table so journal_revoke() will run journal_forget()
1616 * on them. We've already detached each block from the file, so
1617 * bforget() in journal_forget() should be safe.
1618 *
1619 * AKPM: turn on bforget in journal_forget()!!!
1620 */
1621 for (p = first; p < last; p++) {
1622 u32 nr = le32_to_cpu(*p);
1623 if (nr) {
1624 struct buffer_head *bh;
1625
1626 *p = 0;
1627 bh = sb_get_hash_table(inode->i_sb, nr);
1628 ext3_forget(handle, 0, inode, bh, nr);
1629 }
1630 }
1631
1632 ext3_free_blocks(handle, inode, block_to_free, count);
1633 }
1634
1635 /**
1636 * ext3_free_data - free a list of data blocks
1637 * @handle: handle for this transaction
1638 * @inode: inode we are dealing with
1639 * @this_bh: indirect buffer_head which contains *@first and *@last
1640 * @first: array of block numbers
1641 * @last: points immediately past the end of array
1642 *
1643 * We are freeing all blocks referred to from that array (numbers are stored as
1644 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1645 *
1646 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1647 * blocks are contiguous then releasing them at one time will only affect one
1648 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1649 * actually use a lot of journal space.
1650 *
1651 * @this_bh will be %NULL if @first and @last point into the inode's direct
1652 * block pointers.
1653 */
1654 static void ext3_free_data(handle_t *handle, struct inode *inode,
1655 struct buffer_head *this_bh, u32 *first, u32 *last)
1656 {
1657 unsigned long block_to_free = 0; /* Starting block # of a run */
1658 unsigned long count = 0; /* Number of blocks in the run */
1659 u32 *block_to_free_p = NULL; /* Pointer into inode/ind
1660 corresponding to
1661 block_to_free */
1662 unsigned long nr; /* Current block # */
1663 u32 *p; /* Pointer into inode/ind
1664 for current block */
1665 int err;
1666
1667 if (this_bh) { /* For indirect block */
1668 BUFFER_TRACE(this_bh, "get_write_access");
1669 err = ext3_journal_get_write_access(handle, this_bh);
1670 /* Important: if we can't update the indirect pointers
1671 * to the blocks, we can't free them. */
1672 if (err)
1673 return;
1674 }
1675
1676 for (p = first; p < last; p++) {
1677 nr = le32_to_cpu(*p);
1678 if (nr) {
1679 /* accumulate blocks to free if they're contiguous */
1680 if (count == 0) {
1681 block_to_free = nr;
1682 block_to_free_p = p;
1683 count = 1;
1684 } else if (nr == block_to_free + count) {
1685 count++;
1686 } else {
1687 ext3_clear_blocks(handle, inode, this_bh,
1688 block_to_free,
1689 count, block_to_free_p, p);
1690 block_to_free = nr;
1691 block_to_free_p = p;
1692 count = 1;
1693 }
1694 }
1695 }
1696
1697 if (count > 0)
1698 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1699 count, block_to_free_p, p);
1700
1701 if (this_bh) {
1702 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1703 ext3_journal_dirty_metadata(handle, this_bh);
1704 }
1705 }
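/*
 * For example, if *first..*last contain the block numbers
 * { 100, 101, 102, 0, 250 }, the loop above calls
 * ext3_clear_blocks(..., 100, 3, ...) for the contiguous run when it
 * reaches 250, skips the hole, and the final call after the loop
 * releases the single block 250.
 */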
1706
1707 /**
1708 * ext3_free_branches - free an array of branches
1709 * @handle: JBD handle for this transaction
1710 * @inode: inode we are dealing with
1711 * @parent_bh: the buffer_head which contains *@first and *@last
1712 * @first: array of block numbers
1713 * @last: pointer immediately past the end of array
1714 * @depth: depth of the branches to free
1715 *
1716 * We are freeing all blocks referred to from these branches (numbers are
1717 * stored as little-endian 32-bit) and updating @inode->i_blocks
1718 * appropriately.
1719 */
1720 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1721 struct buffer_head *parent_bh,
1722 u32 *first, u32 *last, int depth)
1723 {
1724 unsigned long nr;
1725 u32 *p;
1726
1727 if (is_handle_aborted(handle))
1728 return;
1729
1730 if (depth--) {
1731 struct buffer_head *bh;
1732 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1733 p = last;
1734 while (--p >= first) {
1735 nr = le32_to_cpu(*p);
1736 if (!nr)
1737 continue; /* A hole */
1738
1739 /* Go read the buffer for the next level down */
1740 bh = sb_bread(inode->i_sb, nr);
1741
1742 /*
1743 * A read failure? Report error and clear slot
1744 * (should be rare).
1745 */
1746 if (!bh) {
1747 ext3_error(inode->i_sb, "ext3_free_branches",
1748 "Read failure, inode=%ld, block=%ld",
1749 inode->i_ino, nr);
1750 continue;
1751 }
1752
1753 /* This zaps the entire block. Bottom up. */
1754 BUFFER_TRACE(bh, "free child branches");
1755 ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
1756 (u32*)bh->b_data + addr_per_block,
1757 depth);
1758
1759 /*
1760 * We've probably journalled the indirect block several
1761 * times during the truncate. But it's no longer
1762 * needed and we now drop it from the transaction via
1763 * journal_revoke().
1764 *
1765 * That's easy if it's exclusively part of this
1766 * transaction. But if it's part of the committing
1767 * transaction then journal_forget() will simply
1768 * brelse() it. That means that if the underlying
1769 * block is reallocated in ext3_get_block(),
1770 * unmap_underlying_metadata() will find this block
1771 * and will try to get rid of it. damn, damn.
1772 *
1773 * If this block has already been committed to the
1774 * journal, a revoke record will be written. And
1775 * revoke records must be emitted *before* clearing
1776 * this block's bit in the bitmaps.
1777 */
1778 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
1779
1780 /*
1781 * Everything below this pointer has been
1782 * released. Now let this top-of-subtree go.
1783 *
1784 * We want the freeing of this indirect block to be
1785 * atomic in the journal with the updating of the
1786 * bitmap block which owns it. So make some room in
1787 * the journal.
1788 *
1789 * We zero the parent pointer *after* freeing its
1790 * pointee in the bitmaps, so if extend_transaction()
1791 * for some reason fails to put the bitmap changes and
1792 * the release into the same transaction, recovery
1793 * will merely complain about releasing a free block,
1794 * rather than leaking blocks.
1795 */
1796 if (is_handle_aborted(handle))
1797 return;
1798 if (try_to_extend_transaction(handle, inode)) {
1799 ext3_mark_inode_dirty(handle, inode);
1800 ext3_journal_test_restart(handle, inode);
1801 }
1802
1803 ext3_free_blocks(handle, inode, nr, 1);
1804
1805 if (parent_bh) {
1806 /*
1807 * The block which we have just freed is
1808 * pointed to by an indirect block: journal it
1809 */
1810 BUFFER_TRACE(parent_bh, "get_write_access");
1811 if (!ext3_journal_get_write_access(handle,
1812 parent_bh)){
1813 *p = 0;
1814 BUFFER_TRACE(parent_bh,
1815 "call ext3_journal_dirty_metadata");
1816 ext3_journal_dirty_metadata(handle,
1817 parent_bh);
1818 }
1819 }
1820 }
1821 } else {
1822 /* We have reached the bottom of the tree. */
1823 BUFFER_TRACE(parent_bh, "free data blocks");
1824 ext3_free_data(handle, inode, parent_bh, first, last);
1825 }
1826 }
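/*
 * A note on the recursion above: @depth counts the remaining levels of
 * indirection, so the truncate path calls this with depth 1, 2 and 3
 * for the indirect, double-indirect and triple-indirect trees, and the
 * depth == 0 pass hands the leaf pointers to ext3_free_data().  For a
 * feel of the fan-out (hypothetical 1k-block filesystem, so
 * addr_per_block = 256): a depth-2 subtree can reference up to
 * 256 * 256 = 65536 data blocks (64MB), and a depth-3 subtree up to
 * 256^3 = 16777216 blocks (16GB), all freed bottom-up from here.
 */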
1827
1828 /*
1829 * ext3_truncate()
1830 *
1831 * We block out ext3_get_block() block instantiations across the entire
1832 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
1833 * simultaneously on behalf of the same inode.
1834 *
1835 * As we work through the truncate and commit bits of it to the journal there
1836 * is one core, guiding principle: the file's tree must always be consistent on
1837 * disk. We must be able to restart the truncate after a crash.
1838 *
1839 * The file's tree may be transiently inconsistent in memory (although it
1840 * probably isn't), but whenever we close off and commit a journal transaction,
1841 * the contents of (the filesystem + the journal) must be consistent and
1842 * restartable. It's pretty simple, really: bottom up, right to left (although
1843 * left-to-right works OK too).
1844 *
1845 * Note that at recovery time, journal replay occurs *before* the restart of
1846 * truncate against the orphan inode list.
1847 *
1848 * The committed inode has the new, desired i_size (which is the same as
1849 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
1850 * that this inode's truncate did not complete and it will again call
1851 * ext3_truncate() to have another go. So there will be instantiated blocks
1852 * to the right of the truncation point in a crashed ext3 filesystem. But
1853 * that's fine - as long as they are linked from the inode, the post-crash
1854 * ext3_truncate() run will find them and release them.
1855 */
1856
1857 void ext3_truncate(struct inode * inode)
1858 {
1859 handle_t *handle;
1860 u32 *i_data = inode->u.ext3_i.i_data;
1861 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1862 int offsets[4];
1863 Indirect chain[4];
1864 Indirect *partial;
1865 int nr = 0;
1866 int n;
1867 long last_block;
1868 unsigned blocksize;
1869
1870 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1871 S_ISLNK(inode->i_mode)))
1872 return;
1873 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1874 return;
1875
1876 ext3_discard_prealloc(inode);
1877
1878 handle = start_transaction(inode);
1879 if (IS_ERR(handle))
1880 return; /* AKPM: return what? */
1881
1882 blocksize = inode->i_sb->s_blocksize;
1883 last_block = (inode->i_size + blocksize-1)
1884 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
1885
1886 ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
1887
1888
1889 n = ext3_block_to_path(inode, last_block, offsets);
1890 if (n == 0)
1891 goto out_stop; /* error */
1892
1893 /*
1894 * OK. This truncate is going to happen. We add the inode to the
1895 * orphan list, so that if this truncate spans multiple transactions,
1896 * and we crash, we will resume the truncate when the filesystem
1897 * recovers. It also marks the inode dirty, to catch the new size.
1898 *
1899 * Implication: the file must always be in a sane, consistent
1900 * truncatable state while each transaction commits.
1901 */
1902 if (ext3_orphan_add(handle, inode))
1903 goto out_stop;
1904
1905 /*
1906 * The orphan list entry will now protect us from any crash which
1907 * occurs before the truncate completes, so it is now safe to propagate
1908 * the new, shorter inode size (held for now in i_size) into the
1909 * on-disk inode. We do this via i_disksize, which is the value which
1910 * ext3 *really* writes onto the disk inode.
1911 */
1912 inode->u.ext3_i.i_disksize = inode->i_size;
1913
1914 /*
1915 * From here we block out all ext3_get_block() callers who want to
1916 * modify the block allocation tree.
1917 */
1918 down_write(&inode->u.ext3_i.truncate_sem);
1919
1920 if (n == 1) { /* direct blocks */
1921 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
1922 i_data + EXT3_NDIR_BLOCKS);
1923 goto do_indirects;
1924 }
1925
1926 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
1927 /* Kill the top of shared branch (not detached) */
1928 if (nr) {
1929 if (partial == chain) {
1930 /* Shared branch grows from the inode */
1931 ext3_free_branches(handle, inode, NULL,
1932 &nr, &nr+1, (chain+n-1) - partial);
1933 *partial->p = 0;
1934 /*
1935 * We mark the inode dirty prior to restart,
1936 * and prior to stop. No need for it here.
1937 */
1938 } else {
1939 /* Shared branch grows from an indirect block */
1940 BUFFER_TRACE(partial->bh, "get_write_access");
1941 ext3_free_branches(handle, inode, partial->bh,
1942 partial->p,
1943 partial->p+1, (chain+n-1) - partial);
1944 }
1945 }
1946 /* Clear the ends of indirect blocks on the shared branch */
1947 while (partial > chain) {
1948 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
1949 (u32*)partial->bh->b_data + addr_per_block,
1950 (chain+n-1) - partial);
1951 BUFFER_TRACE(partial->bh, "call brelse");
1952 brelse (partial->bh);
1953 partial--;
1954 }
1955 do_indirects:
1956 /* Kill the remaining (whole) subtrees */
1957 switch (offsets[0]) {
1958 default:
1959 nr = i_data[EXT3_IND_BLOCK];
1960 if (nr) {
1961 ext3_free_branches(handle, inode, NULL,
1962 &nr, &nr+1, 1);
1963 i_data[EXT3_IND_BLOCK] = 0;
1964 }
1965 case EXT3_IND_BLOCK:
1966 nr = i_data[EXT3_DIND_BLOCK];
1967 if (nr) {
1968 ext3_free_branches(handle, inode, NULL,
1969 &nr, &nr+1, 2);
1970 i_data[EXT3_DIND_BLOCK] = 0;
1971 }
1972 case EXT3_DIND_BLOCK:
1973 nr = i_data[EXT3_TIND_BLOCK];
1974 if (nr) {
1975 ext3_free_branches(handle, inode, NULL,
1976 &nr, &nr+1, 3);
1977 i_data[EXT3_TIND_BLOCK] = 0;
1978 }
1979 case EXT3_TIND_BLOCK:
1980 ;
1981 }
1982 up_write(&inode->u.ext3_i.truncate_sem);
1983 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1984 ext3_mark_inode_dirty(handle, inode);
1985
1986 /* In a multi-transaction truncate, we only make the final
1987 * transaction synchronous */
1988 if (IS_SYNC(inode))
1989 handle->h_sync = 1;
1990 out_stop:
1991 /*
1992 * If this was a simple ftruncate(), and the file will remain alive
1993 * then we need to clear up the orphan record which we created above.
1994 * However, if this was a real unlink then we were called by
1995 * ext3_delete_inode(), and we allow that function to clean up the
1996 * orphan info for us.
1997 */
1998 if (inode->i_nlink)
1999 ext3_orphan_del(handle, inode);
2000
2001 ext3_journal_stop(handle, inode);
2002 }
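/*
 * Worked example of the path selection above (hypothetical sizes,
 * 1k-block filesystem): truncating to i_size = 10000 gives
 * last_block = (10000 + 1023) >> 10 = 10.  That is still in the
 * direct range (< EXT3_NDIR_BLOCKS), so ext3_block_to_path() returns
 * n == 1 with offsets[0] == 10: direct slots 10 and 11 are freed via
 * ext3_free_data(), and the "default:" case in do_indirects then
 * releases the whole indirect, double-indirect and triple-indirect
 * subtrees in turn.
 */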
2003
2004 /*
2005 * ext3_get_inode_loc returns with an extra refcount against the
2006 * inode's underlying buffer_head on success.
2007 */
2008
2009 int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
2010 {
2011 struct buffer_head *bh = 0;
2012 unsigned long block;
2013 unsigned long block_group;
2014 unsigned long group_desc;
2015 unsigned long desc;
2016 unsigned long offset;
2017 struct ext3_group_desc * gdp;
2018
2019 if ((inode->i_ino != EXT3_ROOT_INO &&
2020 inode->i_ino != EXT3_ACL_IDX_INO &&
2021 inode->i_ino != EXT3_ACL_DATA_INO &&
2022 inode->i_ino != EXT3_JOURNAL_INO &&
2023 inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
2024 inode->i_ino > le32_to_cpu(
2025 inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
2026 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2027 "bad inode number: %lu", inode->i_ino);
2028 goto bad_inode;
2029 }
2030 block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
2031 if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
2032 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2033 "group >= groups count");
2034 goto bad_inode;
2035 }
2036 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
2037 desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
2038 bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
2039 if (!bh) {
2040 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2041 "Descriptor not loaded");
2042 goto bad_inode;
2043 }
2044
2045 gdp = (struct ext3_group_desc *) bh->b_data;
2046 /*
2047 * Figure out the offset within the block group inode table
2048 */
2049 offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
2050 EXT3_INODE_SIZE(inode->i_sb);
2051 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2052 (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
2053 if (!(bh = sb_bread(inode->i_sb, block))) {
2054 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2055 "unable to read inode block - "
2056 "inode=%lu, block=%lu", inode->i_ino, block);
2057 goto bad_inode;
2058 }
2059 offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
2060
2061 iloc->bh = bh;
2062 iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
2063 iloc->block_group = block_group;
2064
2065 return 0;
2066
2067 bad_inode:
2068 return -EIO;
2069 }
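/*
 * Worked example of the arithmetic above (hypothetical geometry:
 * 1k blocks, 128-byte inodes, 2048 inodes per group, 32-byte group
 * descriptors): inode 5000 lives in block_group (5000 - 1) / 2048 = 2,
 * which with 1024/32 = 32 descriptors per block means group_desc = 0,
 * desc = 2.  Its byte offset into that group's inode table is
 * ((5000 - 1) % 2048) * 128 = 903 * 128 = 115584, i.e. table block
 * 115584 >> 10 = 112, at offset 115584 & 1023 = 896 within that block.
 */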
2070
2071 void ext3_set_inode_flags(struct inode *inode)
2072 {
2073 unsigned int flags = inode->u.ext3_i.i_flags;
2074
2075 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME);
2076 if (flags & EXT3_SYNC_FL)
2077 inode->i_flags |= S_SYNC;
2078 if (flags & EXT3_APPEND_FL)
2079 inode->i_flags |= S_APPEND;
2080 if (flags & EXT3_IMMUTABLE_FL)
2081 inode->i_flags |= S_IMMUTABLE;
2082 if (flags & EXT3_NOATIME_FL)
2083 inode->i_flags |= S_NOATIME;
2084 }
2085
2086
2087 void ext3_read_inode(struct inode * inode)
2088 {
2089 struct ext3_iloc iloc;
2090 struct ext3_inode *raw_inode;
2091 struct buffer_head *bh;
2092 int block;
2093
2094 if(ext3_get_inode_loc(inode, &iloc))
2095 goto bad_inode;
2096 bh = iloc.bh;
2097 raw_inode = iloc.raw_inode;
2098 init_rwsem(&inode->u.ext3_i.truncate_sem);
2099 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2100 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2101 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2102 if(!(test_opt (inode->i_sb, NO_UID32))) {
2103 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2104 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2105 }
2106 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2107 inode->i_size = le32_to_cpu(raw_inode->i_size);
2108 inode->i_atime = le32_to_cpu(raw_inode->i_atime);
2109 inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
2110 inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
2111 inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
2112 /* We now have enough fields to check if the inode was active or not.
2113 * This is needed because nfsd might try to access dead inodes;
2114 * the test is the same one that e2fsck uses.
2115 * NeilBrown 1999oct15
2116 */
2117 if (inode->i_nlink == 0) {
2118 if (inode->i_mode == 0 ||
2119 !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
2120 /* this inode is deleted */
2121 brelse (bh);
2122 goto bad_inode;
2123 }
2124 /* The only unlinked inodes we let through here have
2125 * valid i_mode and are being read by the orphan
2126 * recovery code: that's fine, we're about to complete
2127 * the process of deleting those. */
2128 }
2129 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2130 * (for stat), not the fs block
2131 * size */
2132 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2133 inode->i_version = ++event;
2134 inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
2135 #ifdef EXT3_FRAGMENTS
2136 inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
2137 inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
2138 inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
2139 #endif
2140 inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2141 if (!S_ISREG(inode->i_mode)) {
2142 inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2143 } else {
2144 inode->i_size |=
2145 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2146 }
2147 inode->u.ext3_i.i_disksize = inode->i_size;
2148 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2149 #ifdef EXT3_PREALLOCATE
2150 inode->u.ext3_i.i_prealloc_count = 0;
2151 #endif
2152 inode->u.ext3_i.i_block_group = iloc.block_group;
2153
2154 /*
2155 * NOTE! The in-memory inode i_data array is in little-endian order
2156 * even on big-endian machines: we do NOT byteswap the block numbers!
2157 */
2158 for (block = 0; block < EXT3_N_BLOCKS; block++)
2159 inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
2160 INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
2161
2162 if (inode->i_ino == EXT3_ACL_IDX_INO ||
2163 inode->i_ino == EXT3_ACL_DATA_INO)
2164 /* Nothing to do */ ;
2165 else if (S_ISREG(inode->i_mode)) {
2166 inode->i_op = &ext3_file_inode_operations;
2167 inode->i_fop = &ext3_file_operations;
2168 inode->i_mapping->a_ops = &ext3_aops;
2169 } else if (S_ISDIR(inode->i_mode)) {
2170 inode->i_op = &ext3_dir_inode_operations;
2171 inode->i_fop = &ext3_dir_operations;
2172 } else if (S_ISLNK(inode->i_mode)) {
2173 if (!inode->i_blocks)
2174 inode->i_op = &ext3_fast_symlink_inode_operations;
2175 else {
2176 inode->i_op = &page_symlink_inode_operations;
2177 inode->i_mapping->a_ops = &ext3_aops;
2178 }
2179 } else
2180 init_special_inode(inode, inode->i_mode,
2181 le32_to_cpu(iloc.raw_inode->i_block[0]));
2182 brelse(iloc.bh);
2183 ext3_set_inode_flags(inode);
2184 return;
2185
2186 bad_inode:
2187 make_bad_inode(inode);
2188 return;
2189 }
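/*
 * Size assembly note for the S_ISREG() branch above (illustrative
 * numbers only): a 6GB file, 6442450944 bytes = 0x180000000, is stored
 * on disk as i_size = 0x80000000 with i_size_high = 0x1; the shift/OR
 * above rebuilds the 64-bit value.  The same raw field doubles as
 * i_dir_acl for non-regular inodes, which is why the two cases are
 * kept apart here and in ext3_do_update_inode() below.
 */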
2190
2191 /*
2192 * Post the struct inode info into an on-disk inode location in the
2193 * buffer-cache. This gobbles the caller's reference to the
2194 * buffer_head in the inode location struct.
2195 */
2196
2197 static int ext3_do_update_inode(handle_t *handle,
2198 struct inode *inode,
2199 struct ext3_iloc *iloc)
2200 {
2201 struct ext3_inode *raw_inode = iloc->raw_inode;
2202 struct buffer_head *bh = iloc->bh;
2203 int err = 0, rc, block;
2204
2205 if (handle) {
2206 BUFFER_TRACE(bh, "get_write_access");
2207 err = ext3_journal_get_write_access(handle, bh);
2208 if (err)
2209 goto out_brelse;
2210 }
2211 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2212 if(!(test_opt(inode->i_sb, NO_UID32))) {
2213 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2214 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2215 /*
2216 * Fix up interoperability with old kernels. Otherwise, old inodes get
2217 * re-used with the upper 16 bits of the uid/gid intact
2218 */
2219 if(!inode->u.ext3_i.i_dtime) {
2220 raw_inode->i_uid_high =
2221 cpu_to_le16(high_16_bits(inode->i_uid));
2222 raw_inode->i_gid_high =
2223 cpu_to_le16(high_16_bits(inode->i_gid));
2224 } else {
2225 raw_inode->i_uid_high = 0;
2226 raw_inode->i_gid_high = 0;
2227 }
2228 } else {
2229 raw_inode->i_uid_low =
2230 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2231 raw_inode->i_gid_low =
2232 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2233 raw_inode->i_uid_high = 0;
2234 raw_inode->i_gid_high = 0;
2235 }
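	/*
	 * Illustrative split (hypothetical id): uid 100000 is written as
	 * i_uid_low = 100000 & 0xffff = 34464 and i_uid_high =
	 * 100000 >> 16 = 1, so 1 * 65536 + 34464 rebuilds 100000 on the
	 * read side.  Under NO_UID32 the fs_high2low*() helpers squash
	 * such ids to the configured 16-bit overflow id instead and the
	 * high halves stay zero.
	 */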
2236 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2237 raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
2238 raw_inode->i_atime = cpu_to_le32(inode->i_atime);
2239 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
2240 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
2241 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2242 raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
2243 raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
2244 #ifdef EXT3_FRAGMENTS
2245 raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
2246 raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
2247 raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
2248 #else
2249 /* If we are not tracking these fields in the in-memory inode,
2250 * then preserve them on disk, but still initialise them to zero
2251 * for new inodes. */
2252 if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
2253 raw_inode->i_faddr = 0;
2254 raw_inode->i_frag = 0;
2255 raw_inode->i_fsize = 0;
2256 }
2257 #endif
2258 raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
2259 if (!S_ISREG(inode->i_mode)) {
2260 raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
2261 } else {
2262 raw_inode->i_size_high =
2263 cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
2264 if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
2265 struct super_block *sb = inode->i_sb;
2266 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2267 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2268 EXT3_SB(sb)->s_es->s_rev_level ==
2269 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2270 /* If this is the first large file
2271 * created, add a flag to the superblock.
2272 */
2273 err = ext3_journal_get_write_access(handle,
2274 sb->u.ext3_sb.s_sbh);
2275 if (err)
2276 goto out_brelse;
2277 ext3_update_dynamic_rev(sb);
2278 EXT3_SET_RO_COMPAT_FEATURE(sb,
2279 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2280 sb->s_dirt = 1;
2281 handle->h_sync = 1;
2282 err = ext3_journal_dirty_metadata(handle,
2283 sb->u.ext3_sb.s_sbh);
2284 }
2285 }
2286 }
2287 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2288 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
2289 raw_inode->i_block[0] =
2290 cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
2291 else for (block = 0; block < EXT3_N_BLOCKS; block++)
2292 raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
2293
2294 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2295 rc = ext3_journal_dirty_metadata(handle, bh);
2296 if (!err)
2297 err = rc;
2298 EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
2299
2300 out_brelse:
2301 brelse (bh);
2302 ext3_std_error(inode->i_sb, err);
2303 return err;
2304 }
2305
2306 /*
2307 * ext3_write_inode()
2308 *
2309 * We are called from a few places:
2310 *
2311 * - Within generic_file_write() for O_SYNC files.
2312 * Here, there will be no transaction running. We wait for any running
2313 * transaction to commit.
2314 *
2315 * - Within sys_sync(), kupdate and such.
2316 * We wait on commit, if told to.
2317 *
2318 * - Within prune_icache() (PF_MEMALLOC == true)
2319 * Here we simply return. We can't afford to block kswapd on the
2320 * journal commit.
2321 *
2322 * In all cases it is actually safe for us to return without doing anything,
2323 * because the inode has been copied into a raw inode buffer in
2324 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2325 * knfsd.
2326 *
2327 * Note that we are absolutely dependent upon all inode dirtiers doing the
2328 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2329 * which we are interested.
2330 *
2331 * It would be a bug for them to not do this. The code:
2332 *
2333 * mark_inode_dirty(inode)
2334 * stuff();
2335 * inode->i_size = expr;
2336 *
2337 * is in error because a kswapd-driven write_inode() could occur while
2338 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2339 * will no longer be on the superblock's dirty inode list.
2340 */
2341 void ext3_write_inode(struct inode *inode, int wait)
2342 {
2343 if (current->flags & PF_MEMALLOC)
2344 return;
2345
2346 if (ext3_journal_current_handle()) {
2347 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2348 return;
2349 }
2350
2351 if (!wait)
2352 return;
2353
2354 ext3_force_commit(inode->i_sb);
2355 }
2356
2357 /*
2358 * ext3_setattr()
2359 *
2360 * Called from notify_change.
2361 *
2362 * We want to trap VFS attempts to truncate the file as soon as
2363 * possible. In particular, we want to make sure that when the VFS
2364 * shrinks i_size, we put the inode on the orphan list and modify
2365 * i_disksize immediately, so that during the subsequent flushing of
2366 * dirty pages and freeing of disk blocks, we can guarantee that any
2367 * commit will leave the blocks being flushed in an unused state on
2368 * disk. (On recovery, the inode will get truncated and the blocks will
2369 * be freed, so we have a strong guarantee that no future commit will
2370 * leave these blocks visible to the user.)
2371 *
2372 * This is only needed for regular files. rmdir() has its own path, and
2373 * we can never truncate a directory except on final unlink (at which
2374 * point i_nlink is zero so recovery is easy.)
2375 *
2376 * Called with the BKL.
2377 */
2378
2379 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2380 {
2381 struct inode *inode = dentry->d_inode;
2382 int error, rc = 0;
2383 const unsigned int ia_valid = attr->ia_valid;
2384
2385 error = inode_change_ok(inode, attr);
2386 if (error)
2387 return error;
2388
2389 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2390 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2391 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2392 if (error)
2393 return error;
2394 }
2395
2396 if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2397 handle_t *handle;
2398
2399 handle = ext3_journal_start(inode, 3);
2400 if (IS_ERR(handle)) {
2401 error = PTR_ERR(handle);
2402 goto err_out;
2403 }
2404
2405 error = ext3_orphan_add(handle, inode);
2406 inode->u.ext3_i.i_disksize = attr->ia_size;
2407 rc = ext3_mark_inode_dirty(handle, inode);
2408 if (!error)
2409 error = rc;
2410 ext3_journal_stop(handle, inode);
2411 }
2412
2413 rc = inode_setattr(inode, attr);
2414
2415 /* If inode_setattr's call to ext3_truncate failed to get a
2416 * transaction handle at all, we need to clean up the in-core
2417 * orphan list manually. */
2418 if (inode->i_nlink)
2419 ext3_orphan_del(NULL, inode);
2420
2421 err_out:
2422 ext3_std_error(inode->i_sb, error);
2423 if (!error)
2424 error = rc;
2425 return error;
2426 }
2427
2428
2429 /*
2430 * akpm: how many blocks doth make a writepage()?
2431 *
2432 * With N blocks per page, it may be:
2433 * N data blocks
2434 * 2 indirect blocks
2435 * 2 dindirect
2436 * 1 tindirect
2437 * N+5 bitmap blocks (from the above)
2438 * N+5 group descriptor summary blocks
2439 * 1 inode block
2440 * 1 superblock.
2441 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2442 *
2443 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2444 *
2445 * With ordered or writeback data it's the same, less the N data blocks.
2446 *
2447 * If the inode's direct blocks can hold an integral number of pages then a
2448 * page cannot straddle two indirect blocks, and we can only touch one indirect
2449 * and dindirect block, and the "5" above becomes "3".
2450 *
2451 * This still overestimates under most circumstances. If we were to pass the
2452 * start and end offsets in here as well we could do block_to_path() on each
2453 * block and work out the exact number of indirects which are touched. Pah.
2454 */
2455
2456 int ext3_writepage_trans_blocks(struct inode *inode)
2457 {
2458 int bpp = ext3_journal_blocks_per_page(inode);
2459 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2460 int ret;
2461
2462 if (ext3_should_journal_data(inode))
2463 ret = 3 * (bpp + indirects) + 2;
2464 else
2465 ret = 2 * (bpp + indirects) + 2;
2466
2467 #ifdef CONFIG_QUOTA
2468 ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
2469 #endif
2470
2471 return ret;
2472 }
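/*
 * Worked example of the estimate above (hypothetical geometry: 4k
 * pages on a 1k-block filesystem, so bpp = 4): EXT3_NDIR_BLOCKS (12)
 * is a multiple of 4, so indirects = 3.  Data journaling then reserves
 * 3 * (4 + 3) + 2 = 23 credits per page, ordered/writeback modes
 * 2 * (4 + 3) + 2 = 16, with a further 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
 * on top when quota support is configured.
 */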
2473
2474 int
2475 ext3_mark_iloc_dirty(handle_t *handle,
2476 struct inode *inode,
2477 struct ext3_iloc *iloc)
2478 {
2479 int err = 0;
2480
2481 if (handle) {
2482 /* the do_update_inode consumes one bh->b_count */
2483 atomic_inc(&iloc->bh->b_count);
2484 err = ext3_do_update_inode(handle, inode, iloc);
2485 /* ext3_do_update_inode() does journal_dirty_metadata */
2486 brelse(iloc->bh);
2487 } else {
2488 printk(KERN_EMERG "%s: called with no handle!\n", __FUNCTION__);
2489 }
2490 return err;
2491 }
2492
2493 /*
2494 * On success, we end up with an outstanding reference count against
2495 * iloc->bh. This _must_ be cleaned up later.
2496 */
2497
2498 int
2499 ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2500 struct ext3_iloc *iloc)
2501 {
2502 int err = 0;
2503 if (handle) {
2504 err = ext3_get_inode_loc(inode, iloc);
2505 if (!err) {
2506 BUFFER_TRACE(iloc->bh, "get_write_access");
2507 err = ext3_journal_get_write_access(handle, iloc->bh);
2508 if (err) {
2509 brelse(iloc->bh);
2510 iloc->bh = NULL;
2511 }
2512 }
2513 }
2514 ext3_std_error(inode->i_sb, err);
2515 return err;
2516 }
2517
2518 /*
2519 * akpm: What we do here is to mark the in-core inode as clean
2520 * with respect to inode dirtiness (it may still be data-dirty).
2521 * This means that the in-core inode may be reaped by prune_icache
2522 * without having to perform any I/O. This is a very good thing,
2523 * because *any* task may call prune_icache - even ones which
2524 * have a transaction open against a different journal.
2525 *
2526 * Is this cheating? Not really. Sure, we haven't written the
2527 * inode out, but prune_icache isn't a user-visible syncing function.
2528 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2529 * we start and wait on commits.
2530 *
2531 * Is this efficient/effective? Well, we're being nice to the system
2532 * by cleaning up our inodes proactively so they can be reaped
2533 * without I/O. But we are potentially leaving up to five seconds'
2534 * worth of inodes floating about which prune_icache wants us to
2535 * write out. One way to fix that would be to get prune_icache()
2536 * to do a write_super() to free up some memory. It has the desired
2537 * effect.
2538 */
2539 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2540 {
2541 struct ext3_iloc iloc;
2542 int err;
2543
2544 err = ext3_reserve_inode_write(handle, inode, &iloc);
2545 if (!err)
2546 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2547 return err;
2548 }
2549
2550 /*
2551 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2552 *
2553 * We're really interested in the case where a file is being extended.
2554 * i_size has been changed by generic_commit_write() and we thus need
2555 * to include the updated inode in the current transaction.
2556 *
2557 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2558 * are allocated to the file.
2559 *
2560 * If the inode is marked synchronous, we don't honour that here - doing
2561 * so would cause a commit on atime updates, which we don't bother doing.
2562 * We handle synchronous inodes at the highest possible level.
2563 */
2564 void ext3_dirty_inode(struct inode *inode)
2565 {
2566 handle_t *current_handle = ext3_journal_current_handle();
2567 handle_t *handle;
2568
2569 lock_kernel();
2570 handle = ext3_journal_start(inode, 2);
2571 if (IS_ERR(handle))
2572 goto out;
2573 if (current_handle &&
2574 current_handle->h_transaction != handle->h_transaction) {
2575 /* This task has a transaction open against a different fs */
2576 printk(KERN_EMERG "%s: transactions do not match!\n",
2577 __FUNCTION__);
2578 } else {
2579 jbd_debug(5, "marking dirty. outer handle=%p\n",
2580 current_handle);
2581 ext3_mark_inode_dirty(handle, inode);
2582 }
2583 ext3_journal_stop(handle, inode);
2584 out:
2585 unlock_kernel();
2586 }
2587
2588 #ifdef AKPM
2589 /*
2590 * Bind an inode's backing buffer_head into this transaction, to prevent
2591 * it from being flushed to disk early. Unlike
2592 * ext3_reserve_inode_write, this leaves behind no bh reference and
2593 * returns no iloc structure, so the caller needs to repeat the iloc
2594 * lookup to mark the inode dirty later.
2595 */
2596 static inline int
2597 ext3_pin_inode(handle_t *handle, struct inode *inode)
2598 {
2599 struct ext3_iloc iloc;
2600
2601 int err = 0;
2602 if (handle) {
2603 err = ext3_get_inode_loc(inode, &iloc);
2604 if (!err) {
2605 BUFFER_TRACE(iloc.bh, "get_write_access");
2606 err = journal_get_write_access(handle, iloc.bh);
2607 if (!err)
2608 err = ext3_journal_dirty_metadata(handle,
2609 iloc.bh);
2610 brelse(iloc.bh);
2611 }
2612 }
2613 ext3_std_error(inode->i_sb, err);
2614 return err;
2615 }
2616 #endif
2617
2618 int ext3_change_inode_journal_flag(struct inode *inode, int val)
2619 {
2620 journal_t *journal;
2621 handle_t *handle;
2622 int err;
2623
2624 /*
2625 * We have to be very careful here: changing a data block's
2626 * journaling status dynamically is dangerous. If we write a
2627 * data block to the journal, change the status and then delete
2628 * that block, we risk forgetting to revoke the old log record
2629 * from the journal and so a subsequent replay can corrupt data.
2630 * So, first we make sure that the journal is empty and that
2631 * nobody is changing anything.
2632 */
2633
2634 journal = EXT3_JOURNAL(inode);
2635 if (is_journal_aborted(journal) || IS_RDONLY(inode))
2636 return -EROFS;
2637
2638 journal_lock_updates(journal);
2639 journal_flush(journal);
2640
2641 /*
2642 * OK, there are no updates running now, and all cached data is
2643 * synced to disk. We are now in a completely consistent state
2644 * which doesn't have anything in the journal, and we know that
2645 * no filesystem updates are running, so it is safe to modify
2646 * the inode's in-core data-journaling state flag now.
2647 */
2648
2649 if (val)
2650 inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
2651 else
2652 inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
2653
2654 journal_unlock_updates(journal);
2655
2656 /* Finally we can mark the inode as dirty. */
2657
2658 handle = ext3_journal_start(inode, 1);
2659 if (IS_ERR(handle))
2660 return PTR_ERR(handle);
2661
2662 err = ext3_mark_inode_dirty(handle, inode);
2663 handle->h_sync = 1;
2664 ext3_journal_stop(handle, inode);
2665 ext3_std_error(inode->i_sb, err);
2666
2667 return err;
2668 }
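/*
 * Hypothetical caller-side sketch (not a real call site in this file):
 * only the data-journaling bit of a new flags word should be routed
 * through the helper above, so that the journal gets quiesced before
 * the mode change takes effect:
 *
 *	if ((flags ^ inode->u.ext3_i.i_flags) & EXT3_JOURNAL_DATA_FL)
 *		err = ext3_change_inode_journal_flag(inode,
 *					flags & EXT3_JOURNAL_DATA_FL);
 */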
2669
2670
2671 /*
2672 * ext3_aops_journal_start().
2673 *
2674 * <This function died, but the comment lives on>
2675 *
2676 * We need to take the inode semaphore *outside* the
2677 * journal_start/journal_stop. Otherwise, a different task could do a
2678 * wait_for_commit() while holding ->i_sem, which deadlocks. The rule
2679 * is: transaction open/closes are considered to be a locking operation
2680 * and they nest *inside* ->i_sem.
2681 * ----------------------------------------------------------------------------
2682 * Possible problem:
2683 * ext3_file_write()
2684 * -> generic_file_write()
2685 * -> __alloc_pages()
2686 * -> page_launder()
2687 * -> ext3_writepage()
2688 *
2689 * And the writepage can be on a different fs while we have a
2690 * transaction open against this one! Bad.
2691 *
2692 * I tried making the task PF_MEMALLOC here, but that simply results in
2693 * 0-order allocation failures passed back to generic_file_write().
2694 * Instead, we rely on the reentrancy protection in ext3_writepage().
2695 * ----------------------------------------------------------------------------
2696 * When we do the journal_start() here we don't really need to reserve
2697 * any blocks - we won't need any until we hit ext3_prepare_write(),
2698 * which does all the needed journal extending. However! There is a
2699 * problem with quotas:
2700 *
2701 * Thread 1:
2702 * sys_sync
2703 * ->sync_dquots
2704 * ->commit_dquot
2705 * ->lock_dquot
2706 * ->write_dquot
2707 * ->ext3_file_write
2708 * ->journal_start
2709 * ->ext3_prepare_write
2710 * ->journal_extend
2711 * ->journal_start
2712 * Thread 2:
2713 * ext3_create (for example)
2714 * ->ext3_new_inode
2715 * ->dquot_initialize
2716 * ->lock_dquot
2717 *
2718 * Deadlock. Thread 1's journal_start blocks because thread 2 has a
2719 * transaction open. Thread 2's transaction will never close because
2720 * thread 2 is stuck waiting for the dquot lock.
2721 *
2722 * So. We must ensure that thread 1 *never* needs to extend the journal
2723 * for quota writes. We do that by reserving enough journal blocks
2724 * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
2725 * need to extend" test in ext3_prepare_write() succeeds.
2726 */