[PATCH] map multiple blocks for mpage_readpages()
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / fs / ext3 / inode.c
CommitLineData
1da177e4
LT
1/*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext3_jbd.h>
29#include <linux/jbd.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include "xattr.h"
40#include "acl.h"
41
42static int ext3_writepage_trans_blocks(struct inode *inode);
43
44/*
45 * Test whether an inode is a fast symlink.
46 */
47static inline int ext3_inode_is_fast_symlink(struct inode *inode)
48{
49 int ea_blocks = EXT3_I(inode)->i_file_acl ?
50 (inode->i_sb->s_blocksize >> 9) : 0;
51
52 return (S_ISLNK(inode->i_mode) &&
53 inode->i_blocks - ea_blocks == 0);
54}
55
/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */

int ext3_forget(handle_t *handle, int is_metadata,
		struct inode *inode, struct buffer_head *bh,
		int blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		/* No revoke needed: a plain journal_forget suffices when
		 * the buffer is still in memory. */
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			return ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 * - the block's old contents may sit in the journal, so a revoke
	 *   record is required to stop replay resurrecting it.
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		/* A failed revoke leaves the journal inconsistent: abort. */
		ext3_abort(inode->i_sb, __FUNCTION__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}
105
106/*
107 * Work out how many blocks we need to progress with the next chunk of a
108 * truncate transaction.
109 */
110
111static unsigned long blocks_for_truncate(struct inode *inode)
112{
113 unsigned long needed;
114
115 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
116
117 /* Give ourselves just enough room to cope with inodes in which
118 * i_blocks is corrupt: we've seen disk corruptions in the past
119 * which resulted in random data in an inode which looked enough
120 * like a regular file for ext3 to try to delete it. Things
121 * will go a bit crazy if that happens, but at least we should
122 * try not to panic the whole kernel. */
123 if (needed < 2)
124 needed = 2;
125
126 /* But we need to bound the transaction so we don't overflow the
127 * journal. */
128 if (needed > EXT3_MAX_TRANS_DATA)
129 needed = EXT3_MAX_TRANS_DATA;
130
1f54587b 131 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
1da177e4
LT
132}
133
134/*
135 * Truncate transactions can be complex and absolutely huge. So we need to
136 * be able to restart the transaction at a conventient checkpoint to make
137 * sure we don't overflow the journal.
138 *
139 * start_transaction gets us a new handle for a truncate transaction,
140 * and extend_transaction tries to extend the existing one a bit. If
141 * extend fails, we need to propagate the failure up and restart the
142 * transaction in the top-level truncate loop. --sct
143 */
144
145static handle_t *start_transaction(struct inode *inode)
146{
147 handle_t *result;
148
149 result = ext3_journal_start(inode, blocks_for_truncate(inode));
150 if (!IS_ERR(result))
151 return result;
152
153 ext3_std_error(inode->i_sb, PTR_ERR(result));
154 return result;
155}
156
157/*
158 * Try to extend this transaction for the purposes of truncation.
159 *
160 * Returns 0 if we managed to create more room. If we can't create more
161 * room, and the transaction must be restarted we return 1.
162 */
163static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
164{
165 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
166 return 0;
167 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
168 return 0;
169 return 1;
170}
171
/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.  On return the handle carries fresh credits sized
 * for the next chunk of the truncate.
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
182
/*
 * Called at the last iput() if i_nlink is zero: truncate the file's
 * data, remove the orphan-list record, and free the on-disk inode.
 */
void ext3_delete_inode (struct inode * inode)
{
	handle_t *handle;

	/* Drop all page cache for this inode before touching metadata. */
	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime	= get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}
237
1da177e4
LT
/*
 * One link in a chain of indirect blocks: records where a block number
 * was read from, so the chain can later be re-verified (see verify_chain).
 */
typedef struct {
	__le32	*p;	/* address of the block number (inode or bh data) */
	__le32	key;	/* block number value captured when chain was read */
	struct buffer_head *bh;	/* buffer hosting *p; NULL when p is in the inode */
} Indirect;
243
244static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
245{
246 p->key = *(p->p = v);
247 p->bh = bh;
248}
249
250static inline int verify_chain(Indirect *from, Indirect *to)
251{
252 while (from <= to && from->key == *from->p)
253 from++;
254 return (from > to);
255}
256
257/**
258 * ext3_block_to_path - parse the block number into array of offsets
259 * @inode: inode in question (we are only interested in its superblock)
260 * @i_block: block number to be parsed
261 * @offsets: array to store the offsets in
262 * @boundary: set this non-zero if the referred-to block is likely to be
263 * followed (on disk) by an indirect block.
264 *
265 * To store the locations of file's data ext3 uses a data structure common
266 * for UNIX filesystems - tree of pointers anchored in the inode, with
267 * data blocks at leaves and indirect blocks in intermediate nodes.
268 * This function translates the block number into path in that tree -
269 * return value is the path length and @offsets[n] is the offset of
270 * pointer to (n+1)th node in the nth one. If @block is out of range
271 * (negative or too large) warning is printed and zero returned.
272 *
273 * Note: function doesn't find node addresses, so no IO is needed. All
274 * we need to know is the capacity of indirect blocks (taken from the
275 * inode->i_sb).
276 */
277
278/*
279 * Portability note: the last comparison (check that we fit into triple
280 * indirect block) is spelled differently, because otherwise on an
281 * architecture with 32-bit longs and 8Kb pages we might get into trouble
282 * if our filesystem had 8Kb blocks. We might use long long, but that would
283 * kill us on x86. Oh, well, at least the sign propagation does not matter -
284 * i_block would have to be negative in the very beginning, so we would not
285 * get there at all.
286 */
287
static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;		/* depth of the path being built */
	int final = 0;		/* size of the final-level pointer array */

	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		/* Direct block: single-step path. */
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		/* Single indirect. */
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		/* Double indirect: split remaining index into two levels. */
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		/* Triple indirect: three index levels below the root slot. */
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		/* Blocks remaining before the end of this pointer block;
		 * 0 means the mapped block is the last before a boundary. */
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
326
327/**
328 * ext3_get_branch - read the chain of indirect blocks leading to data
329 * @inode: inode in question
330 * @depth: depth of the chain (1 - direct pointer, etc.)
331 * @offsets: offsets of pointers in inode/indirect blocks
332 * @chain: place to store the result
333 * @err: here we store the error value
334 *
335 * Function fills the array of triples <key, p, bh> and returns %NULL
336 * if everything went OK or the pointer to the last filled triple
337 * (incomplete one) otherwise. Upon the return chain[i].key contains
338 * the number of (i+1)-th block in the chain (as it is stored in memory,
339 * i.e. little-endian 32-bit), chain[i].p contains the address of that
340 * number (it points into struct inode for i==0 and into the bh->b_data
341 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
342 * block for i>0 and NULL for i==0. In other words, it holds the block
343 * numbers of the chain, addresses they were taken from (and where we can
344 * verify that chain did not change) and buffer_heads hosting these
345 * numbers.
346 *
347 * Function stops when it stumbles upon zero pointer (absent block)
348 * (pointer to last triple returned, *@err == 0)
349 * or when it gets an IO error reading an indirect block
350 * (ditto, *@err == -EIO)
351 * or when it notices that chain had been changed while it was reading
352 * (ditto, *@err == -EAGAIN)
353 * or when it reads all @depth-1 indirect blocks successfully and finds
354 * the whole chain, all way to the data (returns %NULL, *err == 0).
355 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;	/* hole at the top level */
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		/* Re-verify after the (possibly sleeping) read: a racing
		 * truncate may have changed the chain while we slept. */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;	/* hole at this level */
	}
	return NULL;	/* full chain found, chain[depth-1].key is the block */

changed:
	brelse(bh);
	*err = -EAGAIN;	/* caller should retry under truncate_mutex */
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;	/* last triple filled (its key is the failure point) */
}
391
392/**
393 * ext3_find_near - find a place for allocation with sufficient locality
394 * @inode: owner
395 * @ind: descriptor of indirect block.
396 *
397 * This function returns the prefered place for block allocation.
398 * It is used when heuristic for sequential allocation fails.
399 * Rules are:
400 * + if there is a block to the left of our position - allocate near it.
401 * + if pointer will live in indirect block - allocate near that block.
402 * + if pointer will live in inode - allocate in the same
403 * cylinder group.
404 *
405 * In the latter case we colour the starting block by the callers PID to
406 * prevent it from clashing with concurrent allocations for a different inode
407 * in the same block group. The PID is used here so that functionally related
408 * files will be close-by on-disk.
409 *
410 * Caller must make sure that @ind is valid and will stay that way.
411 */
412
413static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
414{
415 struct ext3_inode_info *ei = EXT3_I(inode);
416 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
417 __le32 *p;
418 unsigned long bg_start;
419 unsigned long colour;
420
421 /* Try to find previous block */
422 for (p = ind->p - 1; p >= start; p--)
423 if (*p)
424 return le32_to_cpu(*p);
425
426 /* No such thing, so let's try location of indirect block */
427 if (ind->bh)
428 return ind->bh->b_blocknr;
429
430 /*
431 * It is going to be refered from inode itself? OK, just put it into
432 * the same cylinder group then.
433 */
434 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
435 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
436 colour = (current->pid % 16) *
437 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
438 return bg_start + colour;
439}
440
441/**
442 * ext3_find_goal - find a prefered place for allocation.
443 * @inode: owner
444 * @block: block we want
445 * @chain: chain of indirect blocks
446 * @partial: pointer to the last triple within a chain
447 * @goal: place to store the result.
448 *
449 * Normally this function find the prefered place for block allocation,
fe55c452 450 * stores it in *@goal and returns zero.
1da177e4
LT
451 */
452
fe55c452
MC
453static unsigned long ext3_find_goal(struct inode *inode, long block,
454 Indirect chain[4], Indirect *partial)
1da177e4
LT
455{
456 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
457
458 /*
459 * try the heuristic for sequential allocation,
460 * failing that at least try to get decent locality.
461 */
462 if (block_i && (block == block_i->last_alloc_logical_block + 1)
463 && (block_i->last_alloc_physical_block != 0)) {
fe55c452 464 return block_i->last_alloc_physical_block + 1;
1da177e4
LT
465 }
466
fe55c452 467 return ext3_find_near(inode, partial);
1da177e4 468}
b47b2478
MC
/**
 * ext3_blks_to_allocate: Look up the block map and count the number
 * of direct blocks that need to be allocated for the given branch.
 *
 * @branch: chain of indirect blocks
 * @k: number of blocks needed for indirect blocks
 * @blks: number of data blocks to be mapped.
 * @blocks_to_boundary: the offset in the indirect block
 *
 * return the total number of blocks to be allocated, including the
 * direct and indirect blocks.
 */
static int
ext3_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case, [t,d]Indirect block(s) has not allocated yet
	 * then it's clear blocks on that path have not allocated
	 */
	if (k > 0) {
		/* right now we don't handle cross-boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	/* Leaf level exists: count contiguous unallocated slots, staying
	 * within the request size and the indirect-block boundary. */
	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
507
/**
 * ext3_alloc_blocks: multiple allocate blocks needed for a branch
 * @indirect_blks: the number of blocks need to allocate for indirect
 *			blocks
 *
 * @new_blocks: on return it will store the new block numbers for
 *	the indirect blocks(if needed) and the first direct block,
 * @blks:	on return it will store the total number of allocated
 *		direct blocks
 *
 * Returns the number of direct blocks allocated; on failure *err is
 * set and any indirect blocks already obtained are freed.
 */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			unsigned long goal, int indirect_blks, int blks,
			unsigned long long new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;		/* how many of new_blocks[] are filled */
	unsigned long current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks(if not allocated yet), and at least
	 * the first direct block of this branch. That's the
	 * minimum number of blocks need to allocate(required)
	 */
	target = blks + indirect_blks;

	/* Keep asking the allocator until we have all indirect blocks
	 * plus at least one direct block (count > 0 after the drain). */
	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		/* Blocks left over after the indirect blocks are direct
		 * blocks: we are done. */
		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	/* Undo: free any indirect blocks we had already accepted. */
	for (i = 0; i <index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}
1da177e4
LT
568
/**
 * ext3_alloc_branch - allocate and set up a chain of blocks.
 * @inode: owner
 * @indirect_blks: number of allocated indirect blocks
 * @blks: number of allocated direct blocks
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates blocks, zeroes out all but the last one,
 * links them into chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode. It stores the information about that chain in the branch[], in
 * the same format as ext3_get_branch() would do. We are calling it after
 * we had read the existing part of chain and partial points to the last
 * triple of that (one with zero ->key). Upon the exit we have the same
 * picture as after the successful ext3_get_block(), except that in one
 * place chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value the from failed
 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 * as described above and return 0.
 */

static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			int indirect_blks, int *blks, unsigned long goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	unsigned long long new_blocks[4];
	unsigned long long current_block;

	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks;  n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		/* NOTE(review): sb_getblk() can return NULL under memory
		 * pressure; bh is used unchecked below — confirm whether a
		 * NULL check + error path is needed here. */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext3_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if ( n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the new allocated
			 * data blocks numbers
			 */
			for (i=1; i < num; i++)
				*(branch[n].p + i) = cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n ; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	/* Free the indirect blocks one by one, then the run of direct
	 * blocks starting at new_blocks[indirect_blks]. */
	for (i = 0; i <indirect_blks; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);

	ext3_free_blocks(handle, inode, new_blocks[i], num);

	return err;
}
671
672/**
673 * ext3_splice_branch - splice the allocated branch onto inode.
674 * @inode: owner
675 * @block: (logical) number of block we are adding
676 * @chain: chain of indirect blocks (with a missing link - see
677 * ext3_alloc_branch)
678 * @where: location of missing link
b47b2478
MC
679 * @num: number of indirect blocks we are adding
680 * @blks: number of direct blocks we are adding
1da177e4 681 *
fe55c452 682 * This function fills the missing link and does all housekeeping needed in
1da177e4 683 * inode (->i_blocks, etc.). In case of success we end up with the full
fe55c452 684 * chain to new block and return 0.
1da177e4
LT
685 */
686
687static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
b47b2478 688 Indirect *where, int num, int blks)
1da177e4
LT
689{
690 int i;
691 int err = 0;
692 struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
b47b2478 693 unsigned long current_block;
1da177e4
LT
694 /*
695 * If we're splicing into a [td]indirect block (as opposed to the
696 * inode) then we need to get write access to the [td]indirect block
697 * before the splice.
698 */
699 if (where->bh) {
700 BUFFER_TRACE(where->bh, "get_write_access");
701 err = ext3_journal_get_write_access(handle, where->bh);
702 if (err)
703 goto err_out;
704 }
1da177e4
LT
705 /* That's it */
706
707 *where->p = where->key;
b47b2478
MC
708 /* update host bufferhead or inode to point to
709 * more just allocated direct blocks blocks */
710 if (num == 0 && blks > 1) {
711 current_block = le32_to_cpu(where->key + 1);
712 for (i = 1; i < blks; i++)
713 *(where->p + i ) = cpu_to_le32(current_block++);
714 }
1da177e4
LT
715
716 /*
717 * update the most recently allocated logical & physical block
718 * in i_block_alloc_info, to assist find the proper goal block for next
719 * allocation
720 */
721 if (block_i) {
b47b2478
MC
722 block_i->last_alloc_logical_block = block + blks - 1;
723 block_i->last_alloc_physical_block = le32_to_cpu(where[num].key + blks - 1);
1da177e4
LT
724 }
725
726 /* We are done with atomic stuff, now do the rest of housekeeping */
727
728 inode->i_ctime = CURRENT_TIME_SEC;
729 ext3_mark_inode_dirty(handle, inode);
730
731 /* had we spliced it onto indirect block? */
732 if (where->bh) {
733 /*
734 * akpm: If we spliced it onto an indirect block, we haven't
735 * altered the inode. Note however that if it is being spliced
736 * onto an indirect block at the very end of the file (the
737 * file is growing) then we *will* alter the inode to reflect
738 * the new i_size. But that is not done here - it is done in
739 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
740 */
741 jbd_debug(5, "splicing indirect only\n");
742 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
743 err = ext3_journal_dirty_metadata(handle, where->bh);
744 if (err)
745 goto err_out;
746 } else {
747 /*
748 * OK, we spliced it into the inode itself on a direct block.
749 * Inode was dirtied above.
750 */
751 jbd_debug(5, "splicing direct\n");
752 }
753 return err;
754
1da177e4 755err_out:
b47b2478 756 for (i = 1; i <= num; i++) {
1da177e4
LT
757 BUFFER_TRACE(where[i].bh, "call journal_forget");
758 ext3_journal_forget(handle, where[i].bh);
b47b2478 759 ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
1da177e4 760 }
b47b2478
MC
761 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
762
1da177e4
LT
763 return err;
764}
765
766/*
767 * Allocation strategy is simple: if we have to allocate something, we will
768 * have to go the whole way to leaf. So let's do it before attaching anything
769 * to tree, set linkage between the newborn blocks, write them if sync is
770 * required, recheck the path, free and repeat if check fails, otherwise
771 * set the last missing link (that will protect us from any truncate-generated
772 * removals - all blocks on the path are immune now) and possibly force the
773 * write on the parent block.
774 * That has a nice additional property: no special recovery from the failed
775 * allocations is needed - we simply release blocks and do not touch anything
776 * reachable from inode.
777 *
778 * akpm: `handle' can be NULL if create == 0.
779 *
780 * The BKL may not be held on entry here. Be sure to take it early.
89747d36
MC
781 * return > 0, # of blocks mapped or allocated.
782 * return = 0, if plain lookup failed.
783 * return < 0, error case.
1da177e4
LT
784 */
785
d8733c29 786int
89747d36
MC
787ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
788 unsigned long maxblocks, struct buffer_head *bh_result,
789 int create, int extend_disksize)
1da177e4
LT
790{
791 int err = -EIO;
792 int offsets[4];
793 Indirect chain[4];
794 Indirect *partial;
795 unsigned long goal;
b47b2478 796 int indirect_blks;
89747d36
MC
797 int blocks_to_boundary = 0;
798 int depth;
1da177e4 799 struct ext3_inode_info *ei = EXT3_I(inode);
89747d36
MC
800 int count = 0;
801 unsigned long first_block = 0;
802
1da177e4
LT
803
804 J_ASSERT(handle != NULL || create == 0);
89747d36 805 depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
1da177e4
LT
806
807 if (depth == 0)
808 goto out;
809
1da177e4
LT
810 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
811
812 /* Simplest case - block found, no allocation needed */
813 if (!partial) {
89747d36 814 first_block = chain[depth - 1].key;
1da177e4 815 clear_buffer_new(bh_result);
89747d36
MC
816 count++;
817 /*map more blocks*/
818 while (count < maxblocks && count <= blocks_to_boundary) {
819 if (!verify_chain(chain, partial)) {
820 /*
821 * Indirect block might be removed by
822 * truncate while we were reading it.
823 * Handling of that case: forget what we've
824 * got now. Flag the err as EAGAIN, so it
825 * will reread.
826 */
827 err = -EAGAIN;
828 count = 0;
829 break;
830 }
831 if (le32_to_cpu(*(chain[depth-1].p+count) ==
832 (first_block + count)))
833 count++;
834 else
835 break;
836 }
837 if (err != -EAGAIN)
838 goto got_it;
1da177e4
LT
839 }
840
841 /* Next simple case - plain lookup or failed read of indirect block */
fe55c452
MC
842 if (!create || err == -EIO)
843 goto cleanup;
844
97461518 845 mutex_lock(&ei->truncate_mutex);
fe55c452
MC
846
847 /*
848 * If the indirect block is missing while we are reading
849 * the chain(ext3_get_branch() returns -EAGAIN err), or
850 * if the chain has been changed after we grab the semaphore,
851 * (either because another process truncated this branch, or
852 * another get_block allocated this branch) re-grab the chain to see if
853 * the request block has been allocated or not.
854 *
855 * Since we already block the truncate/other get_block
856 * at this point, we will have the current copy of the chain when we
857 * splice the branch into the tree.
858 */
859 if (err == -EAGAIN || !verify_chain(chain, partial)) {
1da177e4 860 while (partial > chain) {
1da177e4
LT
861 brelse(partial->bh);
862 partial--;
863 }
fe55c452
MC
864 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
865 if (!partial) {
89747d36 866 count++;
97461518 867 mutex_unlock(&ei->truncate_mutex);
fe55c452
MC
868 if (err)
869 goto cleanup;
870 clear_buffer_new(bh_result);
871 goto got_it;
872 }
1da177e4
LT
873 }
874
875 /*
fe55c452
MC
876 * Okay, we need to do block allocation. Lazily initialize the block
877 * allocation info here if necessary
878 */
879 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
1da177e4 880 ext3_init_block_alloc_info(inode);
1da177e4 881
fe55c452 882 goal = ext3_find_goal(inode, iblock, chain, partial);
1da177e4 883
b47b2478
MC
884 /* the number of blocks need to allocate for [d,t]indirect blocks */
885 indirect_blks = (chain + depth) - partial - 1;
1da177e4 886
b47b2478
MC
887 /*
888 * Next look up the indirect map to count the totoal number of
889 * direct blocks to allocate for this branch.
890 */
891 count = ext3_blks_to_allocate(partial, indirect_blks,
892 maxblocks, blocks_to_boundary);
1da177e4
LT
893 /*
894 * Block out ext3_truncate while we alter the tree
895 */
b47b2478 896 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
fe55c452 897 offsets + (partial - chain), partial);
1da177e4 898
fe55c452
MC
899 /*
900 * The ext3_splice_branch call will free and forget any buffers
1da177e4
LT
901 * on the new chain if there is a failure, but that risks using
902 * up transaction credits, especially for bitmaps where the
903 * credits cannot be returned. Can we handle this somehow? We
fe55c452
MC
904 * may need to return -EAGAIN upwards in the worst case. --sct
905 */
1da177e4 906 if (!err)
b47b2478
MC
907 err = ext3_splice_branch(handle, inode, iblock,
908 partial, indirect_blks, count);
fe55c452 909 /*
97461518 910 * i_disksize growing is protected by truncate_mutex. Don't forget to
fe55c452
MC
911 * protect it if you're about to implement concurrent
912 * ext3_get_block() -bzzz
913 */
1da177e4
LT
914 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
915 ei->i_disksize = inode->i_size;
97461518 916 mutex_unlock(&ei->truncate_mutex);
1da177e4
LT
917 if (err)
918 goto cleanup;
919
920 set_buffer_new(bh_result);
fe55c452
MC
921got_it:
922 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
89747d36 923 if (blocks_to_boundary == 0)
fe55c452 924 set_buffer_boundary(bh_result);
89747d36 925 err = count;
fe55c452
MC
926 /* Clean up and exit */
927 partial = chain + depth - 1; /* the whole chain */
928cleanup:
1da177e4 929 while (partial > chain) {
fe55c452 930 BUFFER_TRACE(partial->bh, "call brelse");
1da177e4
LT
931 brelse(partial->bh);
932 partial--;
933 }
fe55c452
MC
934 BUFFER_TRACE(bh_result, "returned");
935out:
936 return err;
1da177e4
LT
937}
938
1da177e4
LT
939#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
940
941static int
942ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
b47b2478
MC
943 unsigned long max_blocks,
944 struct buffer_head *bh_result, int create)
1da177e4
LT
945{
946 handle_t *handle = journal_current_handle();
947 int ret = 0;
948
89747d36 949 if (!create)
1da177e4
LT
950 goto get_block; /* A read */
951
89747d36
MC
952 if (max_blocks == 1)
953 goto get_block; /* A single block get */
954
1da177e4
LT
955 if (handle->h_transaction->t_state == T_LOCKED) {
956 /*
957 * Huge direct-io writes can hold off commits for long
958 * periods of time. Let this commit run.
959 */
960 ext3_journal_stop(handle);
961 handle = ext3_journal_start(inode, DIO_CREDITS);
962 if (IS_ERR(handle))
963 ret = PTR_ERR(handle);
964 goto get_block;
965 }
966
967 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
968 /*
969 * Getting low on buffer credits...
970 */
971 ret = ext3_journal_extend(handle, DIO_CREDITS);
972 if (ret > 0) {
973 /*
974 * Couldn't extend the transaction. Start a new one.
975 */
976 ret = ext3_journal_restart(handle, DIO_CREDITS);
977 }
978 }
979
980get_block:
89747d36
MC
981 if (ret == 0) {
982 ret = ext3_get_blocks_handle(handle, inode, iblock,
983 max_blocks, bh_result, create, 0);
984 if (ret > 0) {
985 bh_result->b_size = (ret << inode->i_blkbits);
986 ret = 0;
987 }
988 }
1da177e4
LT
989 return ret;
990}
991
89747d36
MC
992static int ext3_get_blocks(struct inode *inode, sector_t iblock,
993 unsigned long maxblocks, struct buffer_head *bh_result,
994 int create)
995{
996 return ext3_direct_io_get_blocks(inode, iblock, maxblocks,
997 bh_result, create);
998}
999
1000static int ext3_get_block(struct inode *inode, sector_t iblock,
1001 struct buffer_head *bh_result, int create)
1002{
1003 return ext3_get_blocks(inode, iblock, 1, bh_result, create);
1004}
1005
1da177e4
LT
1006/*
1007 * `handle' can be NULL if create is zero
1008 */
1009struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
1010 long block, int create, int * errp)
1011{
1012 struct buffer_head dummy;
1013 int fatal = 0, err;
1014
1015 J_ASSERT(handle != NULL || create == 0);
1016
1017 dummy.b_state = 0;
1018 dummy.b_blocknr = -1000;
1019 buffer_trace_init(&dummy.b_history);
89747d36
MC
1020 err = ext3_get_blocks_handle(handle, inode, block, 1,
1021 &dummy, create, 1);
1022 if (err == 1) {
1023 err = 0;
1024 } else if (err >= 0) {
1025 WARN_ON(1);
1026 err = -EIO;
1027 }
1028 *errp = err;
1029 if (!err && buffer_mapped(&dummy)) {
1da177e4
LT
1030 struct buffer_head *bh;
1031 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
2973dfdb
GOC
1032 if (!bh) {
1033 *errp = -EIO;
1034 goto err;
1035 }
1da177e4
LT
1036 if (buffer_new(&dummy)) {
1037 J_ASSERT(create != 0);
1038 J_ASSERT(handle != 0);
1039
1040 /* Now that we do not always journal data, we
1041 should keep in mind whether this should
1042 always journal the new buffer as metadata.
1043 For now, regular file writes use
1044 ext3_get_block instead, so it's not a
1045 problem. */
1046 lock_buffer(bh);
1047 BUFFER_TRACE(bh, "call get_create_access");
1048 fatal = ext3_journal_get_create_access(handle, bh);
1049 if (!fatal && !buffer_uptodate(bh)) {
1050 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1051 set_buffer_uptodate(bh);
1052 }
1053 unlock_buffer(bh);
1054 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1055 err = ext3_journal_dirty_metadata(handle, bh);
1056 if (!fatal)
1057 fatal = err;
1058 } else {
1059 BUFFER_TRACE(bh, "not a new buffer");
1060 }
1061 if (fatal) {
1062 *errp = fatal;
1063 brelse(bh);
1064 bh = NULL;
1065 }
1066 return bh;
1067 }
2973dfdb 1068err:
1da177e4
LT
1069 return NULL;
1070}
1071
1072struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
1073 int block, int create, int *err)
1074{
1075 struct buffer_head * bh;
1076
1077 bh = ext3_getblk(handle, inode, block, create, err);
1078 if (!bh)
1079 return bh;
1080 if (buffer_uptodate(bh))
1081 return bh;
1082 ll_rw_block(READ, 1, &bh);
1083 wait_on_buffer(bh);
1084 if (buffer_uptodate(bh))
1085 return bh;
1086 put_bh(bh);
1087 *err = -EIO;
1088 return NULL;
1089}
1090
1091static int walk_page_buffers( handle_t *handle,
1092 struct buffer_head *head,
1093 unsigned from,
1094 unsigned to,
1095 int *partial,
1096 int (*fn)( handle_t *handle,
1097 struct buffer_head *bh))
1098{
1099 struct buffer_head *bh;
1100 unsigned block_start, block_end;
1101 unsigned blocksize = head->b_size;
1102 int err, ret = 0;
1103 struct buffer_head *next;
1104
1105 for ( bh = head, block_start = 0;
1106 ret == 0 && (bh != head || !block_start);
1107 block_start = block_end, bh = next)
1108 {
1109 next = bh->b_this_page;
1110 block_end = block_start + blocksize;
1111 if (block_end <= from || block_start >= to) {
1112 if (partial && !buffer_uptodate(bh))
1113 *partial = 1;
1114 continue;
1115 }
1116 err = (*fn)(handle, bh);
1117 if (!ret)
1118 ret = err;
1119 }
1120 return ret;
1121}
1122
1123/*
1124 * To preserve ordering, it is essential that the hole instantiation and
1125 * the data write be encapsulated in a single transaction. We cannot
1126 * close off a transaction and start a new one between the ext3_get_block()
1127 * and the commit_write(). So doing the journal_start at the start of
1128 * prepare_write() is the right place.
1129 *
1130 * Also, this function can nest inside ext3_writepage() ->
1131 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1132 * has generated enough buffer credits to do the whole page. So we won't
1133 * block on the journal in that case, which is good, because the caller may
1134 * be PF_MEMALLOC.
1135 *
1136 * By accident, ext3 can be reentered when a transaction is open via
1137 * quota file writes. If we were to commit the transaction while thus
1138 * reentered, there can be a deadlock - we would be holding a quota
1139 * lock, and the commit would never complete if another thread had a
1140 * transaction open and was blocking on the quota lock - a ranking
1141 * violation.
1142 *
1143 * So what we do is to rely on the fact that journal_stop/journal_start
1144 * will _not_ run commit under these circumstances because handle->h_ref
1145 * is elevated. We'll still have enough credits for the tiny quotafile
1146 * write.
1147 */
1148
1149static int do_journal_get_write_access(handle_t *handle,
1150 struct buffer_head *bh)
1151{
1152 if (!buffer_mapped(bh) || buffer_freed(bh))
1153 return 0;
1154 return ext3_journal_get_write_access(handle, bh);
1155}
1156
1157static int ext3_prepare_write(struct file *file, struct page *page,
1158 unsigned from, unsigned to)
1159{
1160 struct inode *inode = page->mapping->host;
1161 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1162 handle_t *handle;
1163 int retries = 0;
1164
1165retry:
1166 handle = ext3_journal_start(inode, needed_blocks);
1167 if (IS_ERR(handle)) {
1168 ret = PTR_ERR(handle);
1169 goto out;
1170 }
1171 if (test_opt(inode->i_sb, NOBH))
1172 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1173 else
1174 ret = block_prepare_write(page, from, to, ext3_get_block);
1175 if (ret)
1176 goto prepare_write_failed;
1177
1178 if (ext3_should_journal_data(inode)) {
1179 ret = walk_page_buffers(handle, page_buffers(page),
1180 from, to, NULL, do_journal_get_write_access);
1181 }
1182prepare_write_failed:
1183 if (ret)
1184 ext3_journal_stop(handle);
1185 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1186 goto retry;
1187out:
1188 return ret;
1189}
1190
1191int
1192ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1193{
1194 int err = journal_dirty_data(handle, bh);
1195 if (err)
1196 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1197 bh, handle,err);
1198 return err;
1199}
1200
1201/* For commit_write() in data=journal mode */
1202static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1203{
1204 if (!buffer_mapped(bh) || buffer_freed(bh))
1205 return 0;
1206 set_buffer_uptodate(bh);
1207 return ext3_journal_dirty_metadata(handle, bh);
1208}
1209
1210/*
1211 * We need to pick up the new inode size which generic_commit_write gave us
1212 * `file' can be NULL - eg, when called from page_symlink().
1213 *
1214 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1215 * buffers are managed internally.
1216 */
1217
1218static int ext3_ordered_commit_write(struct file *file, struct page *page,
1219 unsigned from, unsigned to)
1220{
1221 handle_t *handle = ext3_journal_current_handle();
1222 struct inode *inode = page->mapping->host;
1223 int ret = 0, ret2;
1224
1225 ret = walk_page_buffers(handle, page_buffers(page),
1226 from, to, NULL, ext3_journal_dirty_data);
1227
1228 if (ret == 0) {
1229 /*
1230 * generic_commit_write() will run mark_inode_dirty() if i_size
1231 * changes. So let's piggyback the i_disksize mark_inode_dirty
1232 * into that.
1233 */
1234 loff_t new_i_size;
1235
1236 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1237 if (new_i_size > EXT3_I(inode)->i_disksize)
1238 EXT3_I(inode)->i_disksize = new_i_size;
1239 ret = generic_commit_write(file, page, from, to);
1240 }
1241 ret2 = ext3_journal_stop(handle);
1242 if (!ret)
1243 ret = ret2;
1244 return ret;
1245}
1246
1247static int ext3_writeback_commit_write(struct file *file, struct page *page,
1248 unsigned from, unsigned to)
1249{
1250 handle_t *handle = ext3_journal_current_handle();
1251 struct inode *inode = page->mapping->host;
1252 int ret = 0, ret2;
1253 loff_t new_i_size;
1254
1255 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1256 if (new_i_size > EXT3_I(inode)->i_disksize)
1257 EXT3_I(inode)->i_disksize = new_i_size;
1258
1259 if (test_opt(inode->i_sb, NOBH))
1260 ret = nobh_commit_write(file, page, from, to);
1261 else
1262 ret = generic_commit_write(file, page, from, to);
1263
1264 ret2 = ext3_journal_stop(handle);
1265 if (!ret)
1266 ret = ret2;
1267 return ret;
1268}
1269
1270static int ext3_journalled_commit_write(struct file *file,
1271 struct page *page, unsigned from, unsigned to)
1272{
1273 handle_t *handle = ext3_journal_current_handle();
1274 struct inode *inode = page->mapping->host;
1275 int ret = 0, ret2;
1276 int partial = 0;
1277 loff_t pos;
1278
1279 /*
1280 * Here we duplicate the generic_commit_write() functionality
1281 */
1282 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1283
1284 ret = walk_page_buffers(handle, page_buffers(page), from,
1285 to, &partial, commit_write_fn);
1286 if (!partial)
1287 SetPageUptodate(page);
1288 if (pos > inode->i_size)
1289 i_size_write(inode, pos);
1290 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1291 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1292 EXT3_I(inode)->i_disksize = inode->i_size;
1293 ret2 = ext3_mark_inode_dirty(handle, inode);
1294 if (!ret)
1295 ret = ret2;
1296 }
1297 ret2 = ext3_journal_stop(handle);
1298 if (!ret)
1299 ret = ret2;
1300 return ret;
1301}
1302
1303/*
1304 * bmap() is special. It gets used by applications such as lilo and by
1305 * the swapper to find the on-disk block of a specific piece of data.
1306 *
1307 * Naturally, this is dangerous if the block concerned is still in the
1308 * journal. If somebody makes a swapfile on an ext3 data-journaling
1309 * filesystem and enables swap, then they may get a nasty shock when the
1310 * data getting swapped to that swapfile suddenly gets overwritten by
1311 * the original zero's written out previously to the journal and
1312 * awaiting writeback in the kernel's buffer cache.
1313 *
1314 * So, if we see any bmap calls here on a modified, data-journaled file,
1315 * take extra steps to flush any blocks which might be in the cache.
1316 */
1317static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1318{
1319 struct inode *inode = mapping->host;
1320 journal_t *journal;
1321 int err;
1322
1323 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1324 /*
1325 * This is a REALLY heavyweight approach, but the use of
1326 * bmap on dirty files is expected to be extremely rare:
1327 * only if we run lilo or swapon on a freshly made file
1328 * do we expect this to happen.
1329 *
1330 * (bmap requires CAP_SYS_RAWIO so this does not
1331 * represent an unprivileged user DOS attack --- we'd be
1332 * in trouble if mortal users could trigger this path at
1333 * will.)
1334 *
1335 * NB. EXT3_STATE_JDATA is not set on files other than
1336 * regular files. If somebody wants to bmap a directory
1337 * or symlink and gets confused because the buffer
1338 * hasn't yet been flushed to disk, they deserve
1339 * everything they get.
1340 */
1341
1342 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1343 journal = EXT3_JOURNAL(inode);
1344 journal_lock_updates(journal);
1345 err = journal_flush(journal);
1346 journal_unlock_updates(journal);
1347
1348 if (err)
1349 return 0;
1350 }
1351
1352 return generic_block_bmap(mapping,block,ext3_get_block);
1353}
1354
1355static int bget_one(handle_t *handle, struct buffer_head *bh)
1356{
1357 get_bh(bh);
1358 return 0;
1359}
1360
1361static int bput_one(handle_t *handle, struct buffer_head *bh)
1362{
1363 put_bh(bh);
1364 return 0;
1365}
1366
1367static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1368{
1369 if (buffer_mapped(bh))
1370 return ext3_journal_dirty_data(handle, bh);
1371 return 0;
1372}
1373
1374/*
1375 * Note that we always start a transaction even if we're not journalling
1376 * data. This is to preserve ordering: any hole instantiation within
1377 * __block_write_full_page -> ext3_get_block() should be journalled
1378 * along with the data so we don't crash and then get metadata which
1379 * refers to old data.
1380 *
1381 * In all journalling modes block_write_full_page() will start the I/O.
1382 *
1383 * Problem:
1384 *
1385 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1386 * ext3_writepage()
1387 *
1388 * Similar for:
1389 *
1390 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1391 *
1392 * Same applies to ext3_get_block(). We will deadlock on various things like
97461518 1393 * lock_journal and i_truncate_mutex.
1da177e4
LT
1394 *
1395 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1396 * allocations fail.
1397 *
1398 * 16May01: If we're reentered then journal_current_handle() will be
1399 * non-zero. We simply *return*.
1400 *
1401 * 1 July 2001: @@@ FIXME:
1402 * In journalled data mode, a data buffer may be metadata against the
1403 * current transaction. But the same file is part of a shared mapping
1404 * and someone does a writepage() on it.
1405 *
1406 * We will move the buffer onto the async_data list, but *after* it has
1407 * been dirtied. So there's a small window where we have dirty data on
1408 * BJ_Metadata.
1409 *
1410 * Note that this only applies to the last partial page in the file. The
1411 * bit which block_write_full_page() uses prepare/commit for. (That's
1412 * broken code anyway: it's wrong for msync()).
1413 *
1414 * It's a rare case: affects the final partial page, for journalled data
1415 * where the file is subject to bith write() and writepage() in the same
1416 * transction. To fix it we'll need a custom block_write_full_page().
1417 * We'll probably need that anyway for journalling writepage() output.
1418 *
1419 * We don't honour synchronous mounts for writepage(). That would be
1420 * disastrous. Any write() or metadata operation will sync the fs for
1421 * us.
1422 *
1423 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1424 * we don't need to open a transaction here.
1425 */
1426static int ext3_ordered_writepage(struct page *page,
1427 struct writeback_control *wbc)
1428{
1429 struct inode *inode = page->mapping->host;
1430 struct buffer_head *page_bufs;
1431 handle_t *handle = NULL;
1432 int ret = 0;
1433 int err;
1434
1435 J_ASSERT(PageLocked(page));
1436
1437 /*
1438 * We give up here if we're reentered, because it might be for a
1439 * different filesystem.
1440 */
1441 if (ext3_journal_current_handle())
1442 goto out_fail;
1443
1444 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1445
1446 if (IS_ERR(handle)) {
1447 ret = PTR_ERR(handle);
1448 goto out_fail;
1449 }
1450
1451 if (!page_has_buffers(page)) {
1452 create_empty_buffers(page, inode->i_sb->s_blocksize,
1453 (1 << BH_Dirty)|(1 << BH_Uptodate));
1454 }
1455 page_bufs = page_buffers(page);
1456 walk_page_buffers(handle, page_bufs, 0,
1457 PAGE_CACHE_SIZE, NULL, bget_one);
1458
1459 ret = block_write_full_page(page, ext3_get_block, wbc);
1460
1461 /*
1462 * The page can become unlocked at any point now, and
1463 * truncate can then come in and change things. So we
1464 * can't touch *page from now on. But *page_bufs is
1465 * safe due to elevated refcount.
1466 */
1467
1468 /*
1469 * And attach them to the current transaction. But only if
1470 * block_write_full_page() succeeded. Otherwise they are unmapped,
1471 * and generally junk.
1472 */
1473 if (ret == 0) {
1474 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1475 NULL, journal_dirty_data_fn);
1476 if (!ret)
1477 ret = err;
1478 }
1479 walk_page_buffers(handle, page_bufs, 0,
1480 PAGE_CACHE_SIZE, NULL, bput_one);
1481 err = ext3_journal_stop(handle);
1482 if (!ret)
1483 ret = err;
1484 return ret;
1485
1486out_fail:
1487 redirty_page_for_writepage(wbc, page);
1488 unlock_page(page);
1489 return ret;
1490}
1491
1da177e4
LT
1492static int ext3_writeback_writepage(struct page *page,
1493 struct writeback_control *wbc)
1494{
1495 struct inode *inode = page->mapping->host;
1496 handle_t *handle = NULL;
1497 int ret = 0;
1498 int err;
1499
1500 if (ext3_journal_current_handle())
1501 goto out_fail;
1502
1503 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1504 if (IS_ERR(handle)) {
1505 ret = PTR_ERR(handle);
1506 goto out_fail;
1507 }
1508
1509 if (test_opt(inode->i_sb, NOBH))
1510 ret = nobh_writepage(page, ext3_get_block, wbc);
1511 else
1512 ret = block_write_full_page(page, ext3_get_block, wbc);
1513
1514 err = ext3_journal_stop(handle);
1515 if (!ret)
1516 ret = err;
1517 return ret;
1518
1519out_fail:
1520 redirty_page_for_writepage(wbc, page);
1521 unlock_page(page);
1522 return ret;
1523}
1524
1525static int ext3_journalled_writepage(struct page *page,
1526 struct writeback_control *wbc)
1527{
1528 struct inode *inode = page->mapping->host;
1529 handle_t *handle = NULL;
1530 int ret = 0;
1531 int err;
1532
1533 if (ext3_journal_current_handle())
1534 goto no_write;
1535
1536 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1537 if (IS_ERR(handle)) {
1538 ret = PTR_ERR(handle);
1539 goto no_write;
1540 }
1541
1542 if (!page_has_buffers(page) || PageChecked(page)) {
1543 /*
1544 * It's mmapped pagecache. Add buffers and journal it. There
1545 * doesn't seem much point in redirtying the page here.
1546 */
1547 ClearPageChecked(page);
1548 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1549 ext3_get_block);
ab4eb43c
DL
1550 if (ret != 0) {
1551 ext3_journal_stop(handle);
1da177e4 1552 goto out_unlock;
ab4eb43c 1553 }
1da177e4
LT
1554 ret = walk_page_buffers(handle, page_buffers(page), 0,
1555 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1556
1557 err = walk_page_buffers(handle, page_buffers(page), 0,
1558 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1559 if (ret == 0)
1560 ret = err;
1561 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1562 unlock_page(page);
1563 } else {
1564 /*
1565 * It may be a page full of checkpoint-mode buffers. We don't
1566 * really know unless we go poke around in the buffer_heads.
1567 * But block_write_full_page will do the right thing.
1568 */
1569 ret = block_write_full_page(page, ext3_get_block, wbc);
1570 }
1571 err = ext3_journal_stop(handle);
1572 if (!ret)
1573 ret = err;
1574out:
1575 return ret;
1576
1577no_write:
1578 redirty_page_for_writepage(wbc, page);
1579out_unlock:
1580 unlock_page(page);
1581 goto out;
1582}
1583
1584static int ext3_readpage(struct file *file, struct page *page)
1585{
1586 return mpage_readpage(page, ext3_get_block);
1587}
1588
1589static int
1590ext3_readpages(struct file *file, struct address_space *mapping,
1591 struct list_head *pages, unsigned nr_pages)
1592{
1593 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1594}
1595
2ff28e22 1596static void ext3_invalidatepage(struct page *page, unsigned long offset)
1da177e4
LT
1597{
1598 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1599
1600 /*
1601 * If it's a full truncate we just forget about the pending dirtying
1602 */
1603 if (offset == 0)
1604 ClearPageChecked(page);
1605
2ff28e22 1606 journal_invalidatepage(journal, page, offset);
1da177e4
LT
1607}
1608
27496a8c 1609static int ext3_releasepage(struct page *page, gfp_t wait)
1da177e4
LT
1610{
1611 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1612
1613 WARN_ON(PageChecked(page));
1614 if (!page_has_buffers(page))
1615 return 0;
1616 return journal_try_to_free_buffers(journal, page, wait);
1617}
1618
1619/*
1620 * If the O_DIRECT write will extend the file then add this inode to the
1621 * orphan list. So recovery will truncate it back to the original size
1622 * if the machine crashes during the write.
1623 *
1624 * If the O_DIRECT write is intantiating holes inside i_size and the machine
1625 * crashes then stale disk data _may_ be exposed inside the file.
1626 */
1627static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1628 const struct iovec *iov, loff_t offset,
1629 unsigned long nr_segs)
1630{
1631 struct file *file = iocb->ki_filp;
1632 struct inode *inode = file->f_mapping->host;
1633 struct ext3_inode_info *ei = EXT3_I(inode);
1634 handle_t *handle = NULL;
1635 ssize_t ret;
1636 int orphan = 0;
1637 size_t count = iov_length(iov, nr_segs);
1638
1639 if (rw == WRITE) {
1640 loff_t final_size = offset + count;
1641
1642 handle = ext3_journal_start(inode, DIO_CREDITS);
1643 if (IS_ERR(handle)) {
1644 ret = PTR_ERR(handle);
1645 goto out;
1646 }
1647 if (final_size > inode->i_size) {
1648 ret = ext3_orphan_add(handle, inode);
1649 if (ret)
1650 goto out_stop;
1651 orphan = 1;
1652 ei->i_disksize = inode->i_size;
1653 }
1654 }
1655
1656 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1657 offset, nr_segs,
1658 ext3_direct_io_get_blocks, NULL);
1659
1660 /*
1661 * Reacquire the handle: ext3_direct_io_get_block() can restart the
1662 * transaction
1663 */
1664 handle = journal_current_handle();
1665
1666out_stop:
1667 if (handle) {
1668 int err;
1669
1670 if (orphan && inode->i_nlink)
1671 ext3_orphan_del(handle, inode);
1672 if (orphan && ret > 0) {
1673 loff_t end = offset + ret;
1674 if (end > inode->i_size) {
1675 ei->i_disksize = end;
1676 i_size_write(inode, end);
1677 /*
1678 * We're going to return a positive `ret'
1679 * here due to non-zero-length I/O, so there's
1680 * no way of reporting error returns from
1681 * ext3_mark_inode_dirty() to userspace. So
1682 * ignore it.
1683 */
1684 ext3_mark_inode_dirty(handle, inode);
1685 }
1686 }
1687 err = ext3_journal_stop(handle);
1688 if (ret == 0)
1689 ret = err;
1690 }
1691out:
1692 return ret;
1693}
1694
1695/*
1696 * Pages can be marked dirty completely asynchronously from ext3's journalling
1697 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1698 * much here because ->set_page_dirty is called under VFS locks. The page is
1699 * not necessarily locked.
1700 *
1701 * We cannot just dirty the page and leave attached buffers clean, because the
1702 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1703 * or jbddirty because all the journalling code will explode.
1704 *
1705 * So what we do is to mark the page "pending dirty" and next time writepage
1706 * is called, propagate that into the buffers appropriately.
1707 */
/* Mark the page "pending dirty" (PageChecked); the next writepage
 * propagates it into the buffers - see the comment above. */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}
1713
1714static struct address_space_operations ext3_ordered_aops = {
1715 .readpage = ext3_readpage,
1716 .readpages = ext3_readpages,
1717 .writepage = ext3_ordered_writepage,
1718 .sync_page = block_sync_page,
1719 .prepare_write = ext3_prepare_write,
1720 .commit_write = ext3_ordered_commit_write,
1721 .bmap = ext3_bmap,
1722 .invalidatepage = ext3_invalidatepage,
1723 .releasepage = ext3_releasepage,
1724 .direct_IO = ext3_direct_IO,
e965f963 1725 .migratepage = buffer_migrate_page,
1da177e4
LT
1726};
1727
1728static struct address_space_operations ext3_writeback_aops = {
1729 .readpage = ext3_readpage,
1730 .readpages = ext3_readpages,
1731 .writepage = ext3_writeback_writepage,
1da177e4
LT
1732 .sync_page = block_sync_page,
1733 .prepare_write = ext3_prepare_write,
1734 .commit_write = ext3_writeback_commit_write,
1735 .bmap = ext3_bmap,
1736 .invalidatepage = ext3_invalidatepage,
1737 .releasepage = ext3_releasepage,
1738 .direct_IO = ext3_direct_IO,
e965f963 1739 .migratepage = buffer_migrate_page,
1da177e4
LT
1740};
1741
1742static struct address_space_operations ext3_journalled_aops = {
1743 .readpage = ext3_readpage,
1744 .readpages = ext3_readpages,
1745 .writepage = ext3_journalled_writepage,
1746 .sync_page = block_sync_page,
1747 .prepare_write = ext3_prepare_write,
1748 .commit_write = ext3_journalled_commit_write,
1749 .set_page_dirty = ext3_journalled_set_page_dirty,
1750 .bmap = ext3_bmap,
1751 .invalidatepage = ext3_invalidatepage,
1752 .releasepage = ext3_releasepage,
1753};
1754
1755void ext3_set_aops(struct inode *inode)
1756{
1757 if (ext3_should_order_data(inode))
1758 inode->i_mapping->a_ops = &ext3_ordered_aops;
1759 else if (ext3_should_writeback_data(inode))
1760 inode->i_mapping->a_ops = &ext3_writeback_aops;
1761 else
1762 inode->i_mapping->a_ops = &ext3_journalled_aops;
1763}
1764
1765/*
1766 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1767 * up to the end of the block which corresponds to `from'.
1768 * This required during truncate. We need to physically zero the tail end
1769 * of that block so it doesn't yield old data if the file is later grown.
1770 */
1771static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1772 struct address_space *mapping, loff_t from)
1773{
1774 unsigned long index = from >> PAGE_CACHE_SHIFT;
1775 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1776 unsigned blocksize, iblock, length, pos;
1777 struct inode *inode = mapping->host;
1778 struct buffer_head *bh;
1779 int err = 0;
1780 void *kaddr;
1781
1782 blocksize = inode->i_sb->s_blocksize;
1783 length = blocksize - (offset & (blocksize - 1));
1784 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1785
1786 /*
1787 * For "nobh" option, we can only work if we don't need to
1788 * read-in the page - otherwise we create buffers to do the IO.
1789 */
cd6ef84e
BP
1790 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1791 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1792 kaddr = kmap_atomic(page, KM_USER0);
1793 memset(kaddr + offset, 0, length);
1794 flush_dcache_page(page);
1795 kunmap_atomic(kaddr, KM_USER0);
1796 set_page_dirty(page);
1797 goto unlock;
1da177e4
LT
1798 }
1799
1800 if (!page_has_buffers(page))
1801 create_empty_buffers(page, blocksize, 0);
1802
1803 /* Find the buffer that contains "offset" */
1804 bh = page_buffers(page);
1805 pos = blocksize;
1806 while (offset >= pos) {
1807 bh = bh->b_this_page;
1808 iblock++;
1809 pos += blocksize;
1810 }
1811
1812 err = 0;
1813 if (buffer_freed(bh)) {
1814 BUFFER_TRACE(bh, "freed: skip");
1815 goto unlock;
1816 }
1817
1818 if (!buffer_mapped(bh)) {
1819 BUFFER_TRACE(bh, "unmapped");
1820 ext3_get_block(inode, iblock, bh, 0);
1821 /* unmapped? It's a hole - nothing to do */
1822 if (!buffer_mapped(bh)) {
1823 BUFFER_TRACE(bh, "still unmapped");
1824 goto unlock;
1825 }
1826 }
1827
1828 /* Ok, it's mapped. Make sure it's up-to-date */
1829 if (PageUptodate(page))
1830 set_buffer_uptodate(bh);
1831
1832 if (!buffer_uptodate(bh)) {
1833 err = -EIO;
1834 ll_rw_block(READ, 1, &bh);
1835 wait_on_buffer(bh);
1836 /* Uhhuh. Read error. Complain and punt. */
1837 if (!buffer_uptodate(bh))
1838 goto unlock;
1839 }
1840
1841 if (ext3_should_journal_data(inode)) {
1842 BUFFER_TRACE(bh, "get write access");
1843 err = ext3_journal_get_write_access(handle, bh);
1844 if (err)
1845 goto unlock;
1846 }
1847
1848 kaddr = kmap_atomic(page, KM_USER0);
1849 memset(kaddr + offset, 0, length);
1850 flush_dcache_page(page);
1851 kunmap_atomic(kaddr, KM_USER0);
1852
1853 BUFFER_TRACE(bh, "zeroed end of block");
1854
1855 err = 0;
1856 if (ext3_should_journal_data(inode)) {
1857 err = ext3_journal_dirty_metadata(handle, bh);
1858 } else {
1859 if (ext3_should_order_data(inode))
1860 err = ext3_journal_dirty_data(handle, bh);
1861 mark_buffer_dirty(bh);
1862 }
1863
1864unlock:
1865 unlock_page(page);
1866 page_cache_release(page);
1867 return err;
1868}
1869
1870/*
1871 * Probably it should be a library function... search for first non-zero word
1872 * or memcmp with zero_page, whatever is better for particular architecture.
1873 * Linus?
1874 */
1875static inline int all_zeroes(__le32 *p, __le32 *q)
1876{
1877 while (p < q)
1878 if (*p++)
1879 return 0;
1880 return 1;
1881}
1882
1883/**
1884 * ext3_find_shared - find the indirect blocks for partial truncation.
1885 * @inode: inode in question
1886 * @depth: depth of the affected branch
1887 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1888 * @chain: place to store the pointers to partial indirect blocks
1889 * @top: place to the (detached) top of branch
1890 *
1891 * This is a helper function used by ext3_truncate().
1892 *
1893 * When we do truncate() we may have to clean the ends of several
1894 * indirect blocks but leave the blocks themselves alive. Block is
1895 * partially truncated if some data below the new i_size is refered
1896 * from it (and it is on the path to the first completely truncated
1897 * data block, indeed). We have to free the top of that path along
1898 * with everything to the right of the path. Since no allocation
1899 * past the truncation point is possible until ext3_truncate()
1900 * finishes, we may safely do the latter, but top of branch may
1901 * require special attention - pageout below the truncation point
1902 * might try to populate it.
1903 *
1904 * We atomically detach the top of branch from the tree, store the
1905 * block number of its root in *@top, pointers to buffer_heads of
1906 * partially truncated blocks - in @chain[].bh and pointers to
1907 * their last elements that should not be removed - in
1908 * @chain[].p. Return value is the pointer to last filled element
1909 * of @chain.
1910 *
1911 * The work left to caller to do the actual freeing of subtrees:
1912 * a) free the subtree starting from *@top
1913 * b) free the subtrees whose roots are stored in
1914 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1915 * c) free the subtrees growing from the inode past the @chain[0].
1916 * (no partially truncated stuff there). */
1917
static Indirect *ext3_find_shared(struct inode *inode,
			int depth,
			int offsets[4],
			Indirect chain[4],
			__le32 *top)
{
	Indirect *partial, *p;
	int k, err;

	*top = 0;
	/* Make k index the deepest non-null offset + 1 */
	for (k = depth; k > 1 && !offsets[k-1]; k--)
		;
	/* Only the first k levels matter: offsets below level k are all
	 * zero, so the truncation point sits at the start of those blocks. */
	partial = ext3_get_branch(inode, k, offsets, chain, &err);
	/* Writer: pointers */
	if (!partial)
		partial = chain + k-1;
	/*
	 * If the branch acquired continuation since we've looked at it -
	 * fine, it should all survive and (new) top doesn't belong to us.
	 */
	if (!partial->key && *partial->p)
		/* Writer: end */
		goto no_top;
	/* Walk back up past indirect blocks whose remaining entries are all
	 * zero: they hold nothing that survives, so the detach point moves
	 * one level closer to the inode. */
	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
		;
	/*
	 * OK, we've found the last block that must survive. The rest of our
	 * branch should be detached before unlocking. However, if that rest
	 * of branch is all ours and does not grow immediately from the inode
	 * it's easier to cheat and just decrement partial->p.
	 */
	if (p == chain + k - 1 && p > chain) {
		p->p--;
	} else {
		*top = *p->p;
		/* Nope, don't do this in ext3.  Must leave the tree intact */
#if 0
		*p->p = 0;
#endif
	}
	/* Writer: end */

	/* Drop the buffer_heads below the detach point; the caller keeps
	 * references to chain[0..p] and must brelse() them itself. */
	while(partial > p)
	{
		brelse(partial->bh);
		partial--;
	}
no_top:
	return partial;
}
1969
1970/*
1971 * Zero a number of block pointers in either an inode or an indirect block.
1972 * If we restart the transaction we must again get write access to the
1973 * indirect block for further modification.
1974 *
1975 * We release `count' blocks on disk, but (last - first) may be greater
1976 * than `count' because there can be holes in there.
1977 */
static void
ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
		unsigned long block_to_free, unsigned long count,
		__le32 *first, __le32 *last)
{
	__le32 *p;
	/* If the transaction is nearly out of credits, checkpoint our
	 * progress (dirty the indirect block and inode) and restart it,
	 * so the frees below always have journal space. */
	if (try_to_extend_transaction(handle, inode)) {
		if (bh) {
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			ext3_journal_dirty_metadata(handle, bh);
		}
		ext3_mark_inode_dirty(handle, inode);
		ext3_journal_test_restart(handle, inode);
		if (bh) {
			/* The restart dropped our write access to the
			 * indirect block; retake it before the caller
			 * modifies it again. */
			BUFFER_TRACE(bh, "retaking write access");
			ext3_journal_get_write_access(handle, bh);
		}
	}

	/*
	 * Any buffers which are on the journal will be in memory. We find
	 * them on the hash table so journal_revoke() will run journal_forget()
	 * on them. We've already detached each block from the file, so
	 * bforget() in journal_forget() should be safe.
	 *
	 * AKPM: turn on bforget in journal_forget()!!!
	 */
	for (p = first; p < last; p++) {
		u32 nr = le32_to_cpu(*p);
		if (nr) {
			struct buffer_head *bh;

			/* Zero the on-disk pointer, then forget/revoke the
			 * block so a committed journal copy cannot resurrect
			 * it after this free. */
			*p = 0;
			bh = sb_find_get_block(inode->i_sb, nr);
			ext3_forget(handle, 0, inode, bh, nr);
		}
	}

	ext3_free_blocks(handle, inode, block_to_free, count);
}
2018
2019/**
2020 * ext3_free_data - free a list of data blocks
2021 * @handle: handle for this transaction
2022 * @inode: inode we are dealing with
2023 * @this_bh: indirect buffer_head which contains *@first and *@last
2024 * @first: array of block numbers
2025 * @last: points immediately past the end of array
2026 *
 * We are freeing all blocks referred to from that array (numbers are stored as
2028 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2029 *
2030 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2031 * blocks are contiguous then releasing them at one time will only affect one
2032 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2033 * actually use a lot of journal space.
2034 *
2035 * @this_bh will be %NULL if @first and @last point into the inode's direct
2036 * block pointers.
2037 */
static void ext3_free_data(handle_t *handle, struct inode *inode,
			   struct buffer_head *this_bh,
			   __le32 *first, __le32 *last)
{
	unsigned long block_to_free = 0;    /* Starting block # of a run */
	unsigned long count = 0;	    /* Number of blocks in the run */
	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
					       corresponding to
					       block_to_free */
	unsigned long nr;		    /* Current block # */
	__le32 *p;			    /* Pointer into inode/ind
					       for current block */
	int err;

	if (this_bh) {				/* For indirect block */
		BUFFER_TRACE(this_bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, this_bh);
		/* Important: if we can't update the indirect pointers
		 * to the blocks, we can't free them. */
		if (err)
			return;
	}

	for (p = first; p < last; p++) {
		nr = le32_to_cpu(*p);
		if (nr) {
			/* accumulate blocks to free if they're contiguous */
			if (count == 0) {
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			} else if (nr == block_to_free + count) {
				count++;
			} else {
				/* Run broken: release the accumulated run,
				 * then start a new one at this block. */
				ext3_clear_blocks(handle, inode, this_bh,
						  block_to_free,
						  count, block_to_free_p, p);
				block_to_free = nr;
				block_to_free_p = p;
				count = 1;
			}
		}
	}

	/* Release the final accumulated run, if any. */
	if (count > 0)
		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				  count, block_to_free_p, p);

	if (this_bh) {
		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
		ext3_journal_dirty_metadata(handle, this_bh);
	}
}
2091
2092/**
2093 * ext3_free_branches - free an array of branches
2094 * @handle: JBD handle for this transaction
2095 * @inode: inode we are dealing with
2096 * @parent_bh: the buffer_head which contains *@first and *@last
2097 * @first: array of block numbers
2098 * @last: pointer immediately past the end of array
2099 * @depth: depth of the branches to free
2100 *
 * We are freeing all blocks referred to from these branches (numbers are
2102 * stored as little-endian 32-bit) and updating @inode->i_blocks
2103 * appropriately.
2104 */
2105static void ext3_free_branches(handle_t *handle, struct inode *inode,
2106 struct buffer_head *parent_bh,
2107 __le32 *first, __le32 *last, int depth)
2108{
2109 unsigned long nr;
2110 __le32 *p;
2111
2112 if (is_handle_aborted(handle))
2113 return;
2114
2115 if (depth--) {
2116 struct buffer_head *bh;
2117 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2118 p = last;
2119 while (--p >= first) {
2120 nr = le32_to_cpu(*p);
2121 if (!nr)
2122 continue; /* A hole */
2123
2124 /* Go read the buffer for the next level down */
2125 bh = sb_bread(inode->i_sb, nr);
2126
2127 /*
2128 * A read failure? Report error and clear slot
2129 * (should be rare).
2130 */
2131 if (!bh) {
2132 ext3_error(inode->i_sb, "ext3_free_branches",
2133 "Read failure, inode=%ld, block=%ld",
2134 inode->i_ino, nr);
2135 continue;
2136 }
2137
2138 /* This zaps the entire block. Bottom up. */
2139 BUFFER_TRACE(bh, "free child branches");
2140 ext3_free_branches(handle, inode, bh,
2141 (__le32*)bh->b_data,
2142 (__le32*)bh->b_data + addr_per_block,
2143 depth);
2144
2145 /*
2146 * We've probably journalled the indirect block several
2147 * times during the truncate. But it's no longer
2148 * needed and we now drop it from the transaction via
2149 * journal_revoke().
2150 *
2151 * That's easy if it's exclusively part of this
2152 * transaction. But if it's part of the committing
2153 * transaction then journal_forget() will simply
2154 * brelse() it. That means that if the underlying
2155 * block is reallocated in ext3_get_block(),
2156 * unmap_underlying_metadata() will find this block
2157 * and will try to get rid of it. damn, damn.
2158 *
2159 * If this block has already been committed to the
2160 * journal, a revoke record will be written. And
2161 * revoke records must be emitted *before* clearing
2162 * this block's bit in the bitmaps.
2163 */
2164 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2165
2166 /*
2167 * Everything below this this pointer has been
2168 * released. Now let this top-of-subtree go.
2169 *
2170 * We want the freeing of this indirect block to be
2171 * atomic in the journal with the updating of the
2172 * bitmap block which owns it. So make some room in
2173 * the journal.
2174 *
2175 * We zero the parent pointer *after* freeing its
2176 * pointee in the bitmaps, so if extend_transaction()
2177 * for some reason fails to put the bitmap changes and
2178 * the release into the same transaction, recovery
2179 * will merely complain about releasing a free block,
2180 * rather than leaking blocks.
2181 */
2182 if (is_handle_aborted(handle))
2183 return;
2184 if (try_to_extend_transaction(handle, inode)) {
2185 ext3_mark_inode_dirty(handle, inode);
2186 ext3_journal_test_restart(handle, inode);
2187 }
2188
2189 ext3_free_blocks(handle, inode, nr, 1);
2190
2191 if (parent_bh) {
2192 /*
2193 * The block which we have just freed is
2194 * pointed to by an indirect block: journal it
2195 */
2196 BUFFER_TRACE(parent_bh, "get_write_access");
2197 if (!ext3_journal_get_write_access(handle,
2198 parent_bh)){
2199 *p = 0;
2200 BUFFER_TRACE(parent_bh,
2201 "call ext3_journal_dirty_metadata");
2202 ext3_journal_dirty_metadata(handle,
2203 parent_bh);
2204 }
2205 }
2206 }
2207 } else {
2208 /* We have reached the bottom of the tree. */
2209 BUFFER_TRACE(parent_bh, "free data blocks");
2210 ext3_free_data(handle, inode, parent_bh, first, last);
2211 }
2212}
2213
2214/*
2215 * ext3_truncate()
2216 *
2217 * We block out ext3_get_block() block instantiations across the entire
2218 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2219 * simultaneously on behalf of the same inode.
2220 *
 * As we work through the truncate and commit bits of it to the journal there
2222 * is one core, guiding principle: the file's tree must always be consistent on
2223 * disk. We must be able to restart the truncate after a crash.
2224 *
2225 * The file's tree may be transiently inconsistent in memory (although it
2226 * probably isn't), but whenever we close off and commit a journal transaction,
2227 * the contents of (the filesystem + the journal) must be consistent and
2228 * restartable. It's pretty simple, really: bottom up, right to left (although
2229 * left-to-right works OK too).
2230 *
2231 * Note that at recovery time, journal replay occurs *before* the restart of
2232 * truncate against the orphan inode list.
2233 *
2234 * The committed inode has the new, desired i_size (which is the same as
2235 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2236 * that this inode's truncate did not complete and it will again call
2237 * ext3_truncate() to have another go. So there will be instantiated blocks
2238 * to the right of the truncation point in a crashed ext3 filesystem. But
2239 * that's fine - as long as they are linked from the inode, the post-crash
2240 * ext3_truncate() run will find them and release them.
2241 */
2242
void ext3_truncate(struct inode * inode)
{
	handle_t *handle;
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n;
	long last_block;
	unsigned blocksize = inode->i_sb->s_blocksize;
	struct page *page;

	/* Only regular files, directories and (slow) symlinks carry block
	 * trees that can be truncated. */
	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
	    S_ISLNK(inode->i_mode)))
		return;
	if (ext3_inode_is_fast_symlink(inode))
		return;
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return;

	/*
	 * We have to lock the EOF page here, because lock_page() nests
	 * outside journal_start().
	 */
	if ((inode->i_size & (blocksize - 1)) == 0) {
		/* Block boundary? Nothing to do */
		page = NULL;
	} else {
		page = grab_cache_page(mapping,
				inode->i_size >> PAGE_CACHE_SHIFT);
		if (!page)
			return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		if (page) {
			/* Zero the page so stale data past EOF cannot leak
			 * if it is later written out, then release it. */
			clear_highpage(page);
			flush_dcache_page(page);
			unlock_page(page);
			page_cache_release(page);
		}
		return;		/* AKPM: return what? */
	}

	last_block = (inode->i_size + blocksize-1)
					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);

	/* Zero the in-block tail of the (partial) EOF page. */
	if (page)
		ext3_block_truncate_page(handle, page, mapping, inode->i_size);

	n = ext3_block_to_path(inode, last_block, offsets, NULL);
	if (n == 0)
		goto out_stop;	/* error */

	/*
	 * OK.  This truncate is going to happen.  We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers.  It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext3_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode.  We do this via i_disksize, which is the value which
	 * ext3 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	/*
	 * From here we block out all ext3_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	mutex_lock(&ei->truncate_mutex);

	if (n == 1) {		/* direct blocks */
		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT3_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop.  No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext3_free_branches(handle, inode, partial->bh,
					partial->p,
					partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse (partial->bh);
		partial--;
	}
do_indirects:
	/* Kill the remaining (whole) subtrees.  Deliberate fallthrough:
	 * truncating within the direct blocks kills all three indirect
	 * trees, within the indirect tree kills double and triple, etc. */
	switch (offsets[0]) {
	default:
		nr = i_data[EXT3_IND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, 1);
			i_data[EXT3_IND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_IND_BLOCK:
		nr = i_data[EXT3_DIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, 2);
			i_data[EXT3_DIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_DIND_BLOCK:
		nr = i_data[EXT3_TIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, 3);
			i_data[EXT3_TIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_TIND_BLOCK:
		;
	}

	ext3_discard_reservation(inode);

	mutex_unlock(&ei->truncate_mutex);
	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/* In a multi-transaction truncate, we only make the final
	 * transaction synchronous */
	if (IS_SYNC(inode))
		handle->h_sync = 1;
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext3_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext3_orphan_del(handle, inode);

	ext3_journal_stop(handle);
}
2415
/*
 * Map an inode number to the filesystem block holding its on-disk
 * structure, and fill in @iloc with the block group and in-block offset.
 * Returns 0 (an invalid block) on a bad inode number or missing
 * group descriptor.
 */
static unsigned long ext3_get_inode_block(struct super_block *sb,
		unsigned long ino, struct ext3_iloc *iloc)
{
	unsigned long desc, group_desc, block_group;
	unsigned long offset, block;
	struct buffer_head *bh;
	struct ext3_group_desc * gdp;


	/* Reserved inodes (root, journal, resize) are allowed below
	 * s_first_ino; anything else there, or above s_inodes_count,
	 * is corrupt. */
	if ((ino != EXT3_ROOT_INO &&
		ino != EXT3_JOURNAL_INO &&
		ino != EXT3_RESIZE_INO &&
		ino < EXT3_FIRST_INO(sb)) ||
	    ino > le32_to_cpu(
			EXT3_SB(sb)->s_es->s_inodes_count)) {
		ext3_error (sb, "ext3_get_inode_block",
			    "bad inode number: %lu", ino);
		return 0;
	}
	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
	if (block_group >= EXT3_SB(sb)->s_groups_count) {
		ext3_error (sb, "ext3_get_inode_block",
			    "group >= groups count");
		return 0;
	}
	/* Pairs with the barrier in online resize: make sure we see the
	 * group descriptor array the groups_count check refers to. */
	smp_rmb();
	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
	desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
	bh = EXT3_SB(sb)->s_group_desc[group_desc];
	if (!bh) {
		ext3_error (sb, "ext3_get_inode_block",
			    "Descriptor not loaded");
		return 0;
	}

	gdp = (struct ext3_group_desc *) bh->b_data;
	/*
	 * Figure out the offset within the block group inode table
	 */
	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
		EXT3_INODE_SIZE(sb);
	block = le32_to_cpu(gdp[desc].bg_inode_table) +
		(offset >> EXT3_BLOCK_SIZE_BITS(sb));

	iloc->block_group = block_group;
	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
	return block;
}
2464
2465/*
2466 * ext3_get_inode_loc returns with an extra refcount against the inode's
2467 * underlying buffer_head on success. If 'in_mem' is true, we have all
2468 * data in memory that is needed to recreate the on-disk version of this
2469 * inode.
2470 */
static int __ext3_get_inode_loc(struct inode *inode,
				struct ext3_iloc *iloc, int in_mem)
{
	unsigned long block;
	struct buffer_head *bh;

	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
	if (!block)
		return -EIO;

	bh = sb_getblk(inode->i_sb, block);
	if (!bh) {
		ext3_error (inode->i_sb, "ext3_get_inode_loc",
				"unable to read inode block - "
				"inode=%lu, block=%lu", inode->i_ino, block);
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/*
		 * If we have all information of the inode in memory and this
		 * is the only valid inode in the block, we need not read the
		 * block.
		 */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			struct ext3_group_desc *desc;
			int inodes_per_buffer;
			int inode_offset, i;
			int block_group;
			int start;

			block_group = (inode->i_ino - 1) /
					EXT3_INODES_PER_GROUP(inode->i_sb);
			inodes_per_buffer = bh->b_size /
				EXT3_INODE_SIZE(inode->i_sb);
			inode_offset = ((inode->i_ino - 1) %
					EXT3_INODES_PER_GROUP(inode->i_sb));
			/* First inode that shares this buffer with ours;
			 * relies on inodes_per_buffer being a power of 2. */
			start = inode_offset & ~(inodes_per_buffer - 1);

			/* Is the inode bitmap in cache? */
			desc = ext3_get_group_desc(inode->i_sb,
						block_group, NULL);
			if (!desc)
				goto make_io;

			bitmap_bh = sb_getblk(inode->i_sb,
					le32_to_cpu(desc->bg_inode_bitmap));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			/* Any *other* in-use inode in this buffer forces a
			 * real read; we must not fabricate its contents. */
			for (i = start; i < start + inodes_per_buffer; i++) {
				if (i == inode_offset)
					continue;
				if (ext3_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_buffer) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * There are other valid inodes in the buffer, this inode
		 * has in-inode xattrs, or we don't have this inode in memory.
		 * Read the block from disk.
		 */
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			ext3_error(inode->i_sb, "ext3_get_inode_loc",
					"unable to read inode block - "
					"inode=%lu, block=%lu",
					inode->i_ino, block);
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	/* Caller inherits our reference on bh and must brelse() it. */
	iloc->bh = bh;
	return 0;
}
2576
2577int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2578{
2579 /* We have all inode data except xattrs in memory here. */
2580 return __ext3_get_inode_loc(inode, iloc,
2581 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2582}
2583
2584void ext3_set_inode_flags(struct inode *inode)
2585{
2586 unsigned int flags = EXT3_I(inode)->i_flags;
2587
2588 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2589 if (flags & EXT3_SYNC_FL)
2590 inode->i_flags |= S_SYNC;
2591 if (flags & EXT3_APPEND_FL)
2592 inode->i_flags |= S_APPEND;
2593 if (flags & EXT3_IMMUTABLE_FL)
2594 inode->i_flags |= S_IMMUTABLE;
2595 if (flags & EXT3_NOATIME_FL)
2596 inode->i_flags |= S_NOATIME;
2597 if (flags & EXT3_DIRSYNC_FL)
2598 inode->i_flags |= S_DIRSYNC;
2599}
2600
/*
 * Read the on-disk inode for @inode (located via __ext3_get_inode_loc)
 * and populate the in-memory inode and ext3_inode_info from it.
 * On any failure the inode is marked bad via make_bad_inode().
 */
void ext3_read_inode(struct inode * inode)
{
	struct ext3_iloc iloc;
	struct ext3_inode *raw_inode;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh;
	int block;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
	ei->i_acl = EXT3_ACL_NOT_CACHED;
	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
	ei->i_block_alloc_info = NULL;

	if (__ext3_get_inode_loc(inode, &iloc, 0))
		goto bad_inode;
	bh = iloc.bh;
	raw_inode = ext3_raw_inode(&iloc);
	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	/* With 32-bit UIDs enabled, fold in the high 16 bits too. */
	if(!(test_opt (inode->i_sb, NO_UID32))) {
		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
	inode->i_size = le32_to_cpu(raw_inode->i_size);
	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
	/* On-disk timestamps have one-second granularity only. */
	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

	ei->i_state = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
			/* this inode is deleted */
			brelse (bh);
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size
					 * (for stat), not the fs block
					 * size */
	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
	ei->i_frag_no = raw_inode->i_frag;
	ei->i_frag_size = raw_inode->i_fsize;
#endif
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
	} else {
		/* For regular files the dir_acl slot holds the high 32
		 * bits of the file size (large-file support). */
		inode->i_size |=
			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
	}
	ei->i_disksize = inode->i_size;
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
	ei->i_block_group = iloc.block_group;
	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT3_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
		/*
		 * When mke2fs creates big inodes it does not zero out
		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
		 * so ignore those first few inodes.
		 */
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
		    EXT3_INODE_SIZE(inode->i_sb))
			goto bad_inode;
		if (ei->i_extra_isize == 0) {
			/* The extra space is currently unused. Use it. */
			ei->i_extra_isize = sizeof(struct ext3_inode) -
					    EXT3_GOOD_OLD_INODE_SIZE;
		} else {
			/* In-inode xattrs start with a magic number right
			 * after the extra fields. */
			__le32 *magic = (void *)raw_inode +
					EXT3_GOOD_OLD_INODE_SIZE +
					ei->i_extra_isize;
			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				 ei->i_state |= EXT3_STATE_XATTR;
		}
	} else
		ei->i_extra_isize = 0;

	/* Wire up the per-file-type operations. */
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		if (ext3_inode_is_fast_symlink(inode))
			inode->i_op = &ext3_fast_symlink_inode_operations;
		else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		inode->i_op = &ext3_special_inode_operations;
		/* Old-format device numbers live in i_block[0], new-format
		 * (large) ones in i_block[1]. */
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	brelse (iloc.bh);
	ext3_set_inode_flags(inode);
	return;

bad_inode:
	make_bad_inode(inode);
	return;
}
2737
2738/*
2739 * Post the struct inode info into an on-disk inode location in the
2740 * buffer-cache. This gobbles the caller's reference to the
2741 * buffer_head in the inode location struct.
2742 *
2743 * The caller must have write access to iloc->bh.
2744 */
static int ext3_do_update_inode(handle_t *handle,
				struct inode *inode,
				struct ext3_iloc *iloc)
{
	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh = iloc->bh;
	int err = 0, rc, block;

	/* For fields not tracked in the in-memory inode,
	 * initialise them to zero for new inodes. */
	if (ei->i_state & EXT3_STATE_NEW)
		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);

	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	if(!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
/*
 * Fix up interoperability with old kernels. Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact
 */
		if(!ei->i_dtime) {
			raw_inode->i_uid_high =
				cpu_to_le16(high_16_bits(inode->i_uid));
			raw_inode->i_gid_high =
				cpu_to_le16(high_16_bits(inode->i_gid));
		} else {
			raw_inode->i_uid_high = 0;
			raw_inode->i_gid_high = 0;
		}
	} else {
		raw_inode->i_uid_low =
			cpu_to_le16(fs_high2lowuid(inode->i_uid));
		raw_inode->i_gid_low =
			cpu_to_le16(fs_high2lowgid(inode->i_gid));
		raw_inode->i_uid_high = 0;
		raw_inode->i_gid_high = 0;
	}
	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
	/* i_disksize, not i_size: only fully-flushed sizes reach disk. */
	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
#ifdef EXT3_FRAGMENTS
	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
	raw_inode->i_frag = ei->i_frag_no;
	raw_inode->i_fsize = ei->i_frag_size;
#endif
	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
	} else {
		/* Regular files reuse the dir_acl slot for the high 32
		 * bits of the size. */
		raw_inode->i_size_high =
			cpu_to_le32(ei->i_disksize >> 32);
		if (ei->i_disksize > 0x7fffffffULL) {
			struct super_block *sb = inode->i_sb;
			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
			    EXT3_SB(sb)->s_es->s_rev_level ==
					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
			       /* If this is the first large file
				* created, add a flag to the superblock.
				*/
				err = ext3_journal_get_write_access(handle,
						EXT3_SB(sb)->s_sbh);
				if (err)
					goto out_brelse;
				ext3_update_dynamic_rev(sb);
				EXT3_SET_RO_COMPAT_FEATURE(sb,
					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
				sb->s_dirt = 1;
				handle->h_sync = 1;
				err = ext3_journal_dirty_metadata(handle,
						EXT3_SB(sb)->s_sbh);
			}
		}
	}
	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* Device numbers: old encoding in i_block[0], new (large)
		 * encoding in i_block[1]; the unused slot is zeroed. */
		if (old_valid_dev(inode->i_rdev)) {
			raw_inode->i_block[0] =
				cpu_to_le32(old_encode_dev(inode->i_rdev));
			raw_inode->i_block[1] = 0;
		} else {
			raw_inode->i_block[0] = 0;
			raw_inode->i_block[1] =
				cpu_to_le32(new_encode_dev(inode->i_rdev));
			raw_inode->i_block[2] = 0;
		}
	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
		raw_inode->i_block[block] = ei->i_data[block];

	if (ei->i_extra_isize)
		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);

	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
	rc = ext3_journal_dirty_metadata(handle, bh);
	if (!err)
		err = rc;
	ei->i_state &= ~EXT3_STATE_NEW;

out_brelse:
	brelse (bh);
	ext3_std_error(inode->i_sb, err);
	return err;
}
2855
2856/*
2857 * ext3_write_inode()
2858 *
2859 * We are called from a few places:
2860 *
2861 * - Within generic_file_write() for O_SYNC files.
2862 * Here, there will be no transaction running. We wait for any running
 * transaction to commit.
2864 *
2865 * - Within sys_sync(), kupdate and such.
 * We wait on commit, if told to.
2867 *
2868 * - Within prune_icache() (PF_MEMALLOC == true)
2869 * Here we simply return. We can't afford to block kswapd on the
2870 * journal commit.
2871 *
2872 * In all cases it is actually safe for us to return without doing anything,
2873 * because the inode has been copied into a raw inode buffer in
2874 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2875 * knfsd.
2876 *
2877 * Note that we are absolutely dependent upon all inode dirtiers doing the
2878 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2879 * which we are interested.
2880 *
2881 * It would be a bug for them to not do this. The code:
2882 *
2883 * mark_inode_dirty(inode)
2884 * stuff();
2885 * inode->i_size = expr;
2886 *
2887 * is in error because a kswapd-driven write_inode() could occur while
2888 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2889 * will no longer be on the superblock's dirty inode list.
2890 */
2891int ext3_write_inode(struct inode *inode, int wait)
2892{
2893 if (current->flags & PF_MEMALLOC)
2894 return 0;
2895
2896 if (ext3_journal_current_handle()) {
2897 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2898 dump_stack();
2899 return -EIO;
2900 }
2901
2902 if (!wait)
2903 return 0;
2904
2905 return ext3_force_commit(inode->i_sb);
2906}
2907
2908/*
2909 * ext3_setattr()
2910 *
2911 * Called from notify_change.
2912 *
2913 * We want to trap VFS attempts to truncate the file as soon as
2914 * possible. In particular, we want to make sure that when the VFS
2915 * shrinks i_size, we put the inode on the orphan list and modify
2916 * i_disksize immediately, so that during the subsequent flushing of
2917 * dirty pages and freeing of disk blocks, we can guarantee that any
2918 * commit will leave the blocks being flushed in an unused state on
2919 * disk. (On recovery, the inode will get truncated and the blocks will
2920 * be freed, so we have a strong guarantee that no future commit will
2921 * leave these blocks visible to the user.)
2922 *
2923 * Called with inode->sem down.
2924 */
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error, rc = 0;
	const unsigned int ia_valid = attr->ia_valid;

	/* Let the VFS validate the requested attribute change first. */
	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/*
	 * Ownership change: quota must be transferred to the new owner and
	 * the inode updated in one transaction, so start the handle before
	 * touching anything.
	 */
	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		handle_t *handle;

		/* (user+group)*(old+new) structure, inode write (sb,
		 * inode block, ? - but truncate inode update has it) */
		handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
					EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}
		/* Move the charged blocks/inodes to the new uid/gid. */
		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
		if (error) {
			ext3_journal_stop(handle);
			return error;
		}
		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		error = ext3_mark_inode_dirty(handle, inode);
		ext3_journal_stop(handle);
	}

	/*
	 * Shrinking truncate: commit the orphan-list entry and the new
	 * i_disksize *before* the actual truncate (done later via
	 * inode_setattr), so that a crash in between leaves the blocks
	 * reclaimable on recovery — see the header comment above.
	 */
	if (S_ISREG(inode->i_mode) &&
		attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
		handle_t *handle;

		handle = ext3_journal_start(inode, 3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}

		error = ext3_orphan_add(handle, inode);
		EXT3_I(inode)->i_disksize = attr->ia_size;
		rc = ext3_mark_inode_dirty(handle, inode);
		if (!error)
			error = rc;
		ext3_journal_stop(handle);
	}

	/* Generic attribute update; triggers ext3_truncate() on ATTR_SIZE. */
	rc = inode_setattr(inode, attr);

	/* If inode_setattr's call to ext3_truncate failed to get a
	 * transaction handle at all, we need to clean up the in-core
	 * orphan list manually. */
	if (inode->i_nlink)
		ext3_orphan_del(NULL, inode);

	if (!rc && (ia_valid & ATTR_MODE))
		rc = ext3_acl_chmod(inode);

err_out:
	ext3_std_error(inode->i_sb, error);
	if (!error)
		error = rc;
	return error;
}
2997
2998
2999/*
3000 * akpm: how many blocks doth make a writepage()?
3001 *
3002 * With N blocks per page, it may be:
3003 * N data blocks
3004 * 2 indirect block
3005 * 2 dindirect
3006 * 1 tindirect
3007 * N+5 bitmap blocks (from the above)
3008 * N+5 group descriptor summary blocks
3009 * 1 inode block
3010 * 1 superblock.
 3011 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3012 *
3013 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3014 *
3015 * With ordered or writeback data it's the same, less the N data blocks.
3016 *
3017 * If the inode's direct blocks can hold an integral number of pages then a
3018 * page cannot straddle two indirect blocks, and we can only touch one indirect
3019 * and dindirect block, and the "5" above becomes "3".
3020 *
3021 * This still overestimates under most circumstances. If we were to pass the
3022 * start and end offsets in here as well we could do block_to_path() on each
3023 * block and work out the exact number of indirects which are touched. Pah.
3024 */
3025
3026static int ext3_writepage_trans_blocks(struct inode *inode)
3027{
3028 int bpp = ext3_journal_blocks_per_page(inode);
3029 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3030 int ret;
3031
3032 if (ext3_should_journal_data(inode))
3033 ret = 3 * (bpp + indirects) + 2;
3034 else
3035 ret = 2 * (bpp + indirects) + 2;
3036
3037#ifdef CONFIG_QUOTA
3038 /* We know that structure was already allocated during DQUOT_INIT so
3039 * we will be updating only the data blocks + inodes */
1f54587b 3040 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
1da177e4
LT
3041#endif
3042
3043 return ret;
3044}
3045
3046/*
3047 * The caller must have previously called ext3_reserve_inode_write().
 3048 * Given this, we know that the caller already has write access to iloc->bh.
3049 */
3050int ext3_mark_iloc_dirty(handle_t *handle,
3051 struct inode *inode, struct ext3_iloc *iloc)
3052{
3053 int err = 0;
3054
3055 /* the do_update_inode consumes one bh->b_count */
3056 get_bh(iloc->bh);
3057
3058 /* ext3_do_update_inode() does journal_dirty_metadata */
3059 err = ext3_do_update_inode(handle, inode, iloc);
3060 put_bh(iloc->bh);
3061 return err;
3062}
3063
3064/*
3065 * On success, We end up with an outstanding reference count against
3066 * iloc->bh. This _must_ be cleaned up later.
3067 */
3068
3069int
3070ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3071 struct ext3_iloc *iloc)
3072{
3073 int err = 0;
3074 if (handle) {
3075 err = ext3_get_inode_loc(inode, iloc);
3076 if (!err) {
3077 BUFFER_TRACE(iloc->bh, "get_write_access");
3078 err = ext3_journal_get_write_access(handle, iloc->bh);
3079 if (err) {
3080 brelse(iloc->bh);
3081 iloc->bh = NULL;
3082 }
3083 }
3084 }
3085 ext3_std_error(inode->i_sb, err);
3086 return err;
3087}
3088
3089/*
3090 * akpm: What we do here is to mark the in-core inode as clean
3091 * with respect to inode dirtiness (it may still be data-dirty).
3092 * This means that the in-core inode may be reaped by prune_icache
3093 * without having to perform any I/O. This is a very good thing,
3094 * because *any* task may call prune_icache - even ones which
3095 * have a transaction open against a different journal.
3096 *
3097 * Is this cheating? Not really. Sure, we haven't written the
3098 * inode out, but prune_icache isn't a user-visible syncing function.
3099 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3100 * we start and wait on commits.
3101 *
3102 * Is this efficient/effective? Well, we're being nice to the system
3103 * by cleaning up our inodes proactively so they can be reaped
3104 * without I/O. But we are potentially leaving up to five seconds'
3105 * worth of inodes floating about which prune_icache wants us to
3106 * write out. One way to fix that would be to get prune_icache()
3107 * to do a write_super() to free up some memory. It has the desired
3108 * effect.
3109 */
3110int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3111{
3112 struct ext3_iloc iloc;
3113 int err;
3114
3115 might_sleep();
3116 err = ext3_reserve_inode_write(handle, inode, &iloc);
3117 if (!err)
3118 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3119 return err;
3120}
3121
3122/*
3123 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3124 *
3125 * We're really interested in the case where a file is being extended.
3126 * i_size has been changed by generic_commit_write() and we thus need
3127 * to include the updated inode in the current transaction.
3128 *
3129 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3130 * are allocated to the file.
3131 *
3132 * If the inode is marked synchronous, we don't honour that here - doing
3133 * so would cause a commit on atime updates, which we don't bother doing.
3134 * We handle synchronous inodes at the highest possible level.
3135 */
3136void ext3_dirty_inode(struct inode *inode)
3137{
3138 handle_t *current_handle = ext3_journal_current_handle();
3139 handle_t *handle;
3140
3141 handle = ext3_journal_start(inode, 2);
3142 if (IS_ERR(handle))
3143 goto out;
3144 if (current_handle &&
3145 current_handle->h_transaction != handle->h_transaction) {
3146 /* This task has a transaction open against a different fs */
3147 printk(KERN_EMERG "%s: transactions do not match!\n",
3148 __FUNCTION__);
3149 } else {
3150 jbd_debug(5, "marking dirty. outer handle=%p\n",
3151 current_handle);
3152 ext3_mark_inode_dirty(handle, inode);
3153 }
3154 ext3_journal_stop(handle);
3155out:
3156 return;
3157}
3158
#ifdef AKPM
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (!handle)
		goto out;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		goto out;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext3_journal_dirty_metadata(handle, iloc.bh);
	/* Drop our reference; the transaction keeps the bh pinned. */
	brelse(iloc.bh);
out:
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
3188
3189int ext3_change_inode_journal_flag(struct inode *inode, int val)
3190{
3191 journal_t *journal;
3192 handle_t *handle;
3193 int err;
3194
3195 /*
3196 * We have to be very careful here: changing a data block's
3197 * journaling status dynamically is dangerous. If we write a
3198 * data block to the journal, change the status and then delete
3199 * that block, we risk forgetting to revoke the old log record
3200 * from the journal and so a subsequent replay can corrupt data.
3201 * So, first we make sure that the journal is empty and that
3202 * nobody is changing anything.
3203 */
3204
3205 journal = EXT3_JOURNAL(inode);
3206 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3207 return -EROFS;
3208
3209 journal_lock_updates(journal);
3210 journal_flush(journal);
3211
3212 /*
3213 * OK, there are no updates running now, and all cached data is
3214 * synced to disk. We are now in a completely consistent state
3215 * which doesn't have anything in the journal, and we know that
3216 * no filesystem updates are running, so it is safe to modify
3217 * the inode's in-core data-journaling state flag now.
3218 */
3219
3220 if (val)
3221 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3222 else
3223 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3224 ext3_set_aops(inode);
3225
3226 journal_unlock_updates(journal);
3227
3228 /* Finally we can mark the inode as dirty. */
3229
3230 handle = ext3_journal_start(inode, 1);
3231 if (IS_ERR(handle))
3232 return PTR_ERR(handle);
3233
3234 err = ext3_mark_inode_dirty(handle, inode);
3235 handle->h_sync = 1;
3236 ext3_journal_stop(handle);
3237 ext3_std_error(inode->i_sb, err);
3238
3239 return err;
3240}