Merge tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 22 Mar 2013 00:56:10 +0000 (17:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 22 Mar 2013 00:56:10 +0000 (17:56 -0700)
Pull ext4 fixes from Ted Ts'o:
 "Fix a number of regression and other bugs in ext4, most of which were
  relatively obscure cornercases or races that were found using
  regression tests."

* tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
  ext4: fix data=journal fast mount/umount hang
  ext4: fix ext4_evict_inode() racing against workqueue processing code
  ext4: fix memory leakage in mext_check_coverage
  ext4: use s_extent_max_zeroout_kb value as number of kb
  ext4: use atomic64_t for the per-flexbg free_clusters count
  jbd2: fix use after free in jbd2_journal_dirty_metadata()
  ext4: reserve metadata block for every delayed write
  ext4: update reserved space after the 'correction'
  ext4: do not use yield()
  ext4: remove unused variable in ext4_free_blocks()
  ext4: fix WARN_ON from ext4_releasepage()
  ext4: fix the wrong number of the allocated blocks in ext4_split_extent()
  ext4: update extent status tree after an extent is zeroed out
  ext4: fix wrong m_len value after unwritten extent conversion
  ext4: add self-testing infrastructure to do a sanity check
  ext4: avoid a potential overflow in ext4_es_can_be_merged()
  ext4: invalidate extent status tree during extent migration
  ext4: remove unnecessary wait for extent conversion in ext4_fallocate()
  ext4: add warning to ext4_convert_unwritten_extents_endio
  ext4: disable merging of uninitialized extents
  ...

12 files changed:
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/extents_status.h
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/move_extent.c
fs/ext4/page-io.c
fs/ext4/resize.c
fs/ext4/super.c
fs/jbd2/transaction.c

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4a01ba315262259241081ec695c1f1d3d179eb32..3b83cd6047964ab7218aeb26ab73bce7be472430 100644
@@ -335,9 +335,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-       atomic_t free_inodes;
-       atomic_t free_clusters;
-       atomic_t used_dirs;
+       atomic64_t      free_clusters;
+       atomic_t        free_inodes;
+       atomic_t        used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT   0x0001 /* Inode table/bitmap not in use */
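
The point of the free_clusters change: this per-flexbg counter aggregates the free cluster counts of every block group in the flex group, and on a sufficiently large filesystem that sum can exceed what a signed 32-bit atomic_t can represent. A minimal userspace sketch of the same idea, using C11 atomics and hypothetical sizes (illustrative only, not kernel code):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct flex_counters {
            _Atomic int64_t free_clusters;  /* was 32-bit, now 64-bit */
            _Atomic int32_t free_inodes;
            _Atomic int32_t used_dirs;
    };

    int main(void)
    {
            struct flex_counters fc = { 0 };

            /* A hypothetical flex group spanning 2^17 block groups with
             * 2^15 free clusters each sums to 2^32 clusters, which no
             * longer fits in a signed 32-bit counter. */
            for (int g = 0; g < (1 << 17); g++)
                    atomic_fetch_add(&fc.free_clusters, 1 << 15);

            printf("free clusters: %lld\n",
                   (long long)atomic_load(&fc.free_clusters));
            return 0;
    }

The 64-bit member is also moved to the front of the struct, which keeps it naturally aligned without a padding hole in the middle of the struct.
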
@@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 extern int __init ext4_init_pageio(void);
 extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_wait(struct inode *);
+extern void ext4_ioend_shutdown(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern void ext4_end_io_work(struct work_struct *work);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 28dd8eeea6a93f2ce5ae727777ede13582e2038f..56efcaadf8485a5887ae04493b9cacf28b5c73ef 100644
@@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
        unsigned short ext1_ee_len, ext2_ee_len, max_len;
 
        /*
-        * Make sure that either both extents are uninitialized, or
-        * both are _not_.
+        * Make sure that both extents are initialized. We don't merge
+        * uninitialized extents so that we can be sure that end_io code has
+        * the extent that was written properly split out and conversion to
+        * initialized is trivial.
         */
-       if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+       if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
                return 0;
 
        if (ext4_ext_is_uninitialized(ex1))
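
The change from XOR to OR is easiest to see as a pair of predicates. A small standalone sketch (illustrative only, not the kernel function) enumerating all four combinations:

    #include <stdbool.h>
    #include <stdio.h>

    /* Old rule: refuse the merge only when exactly one extent is uninitialized. */
    static bool old_rule_allows(bool uninit1, bool uninit2)
    {
            return !(uninit1 ^ uninit2);
    }

    /* New rule: refuse the merge whenever either extent is uninitialized. */
    static bool new_rule_allows(bool uninit1, bool uninit2)
    {
            return !(uninit1 || uninit2);
    }

    int main(void)
    {
            for (int a = 0; a <= 1; a++)
                    for (int b = 0; b <= 1; b++)
                            printf("uninit1=%d uninit2=%d old=%d new=%d\n",
                                   a, b, old_rule_allows(a, b),
                                   new_rule_allows(a, b));
            return 0;
    }

Only the both-uninitialized case changes: two uninitialized extents are no longer merged, which keeps the extent written by the end_io path split out on its own so its conversion to initialized stays trivial.
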
@@ -2923,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle,
 {
        ext4_fsblk_t newblock;
        ext4_lblk_t ee_block;
-       struct ext4_extent *ex, newex, orig_ex;
+       struct ext4_extent *ex, newex, orig_ex, zero_ex;
        struct ext4_extent *ex2 = NULL;
        unsigned int ee_len, depth;
        int err = 0;
@@ -2943,6 +2945,10 @@ static int ext4_split_extent_at(handle_t *handle,
        newblock = split - ee_block + ext4_ext_pblock(ex);
 
        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+       BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+              split_flag & (EXT4_EXT_MAY_ZEROOUT |
+                            EXT4_EXT_MARK_UNINIT1 |
+                            EXT4_EXT_MARK_UNINIT2));
 
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
@@ -2990,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle,
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
-                       if (split_flag & EXT4_EXT_DATA_VALID1)
+                       if (split_flag & EXT4_EXT_DATA_VALID1) {
                                err = ext4_ext_zeroout(inode, ex2);
-                       else
+                               zero_ex.ee_block = ex2->ee_block;
+                               zero_ex.ee_len = ext4_ext_get_actual_len(ex2);
+                               ext4_ext_store_pblock(&zero_ex,
+                                                     ext4_ext_pblock(ex2));
+                       } else {
                                err = ext4_ext_zeroout(inode, ex);
-               } else
+                               zero_ex.ee_block = ex->ee_block;
+                               zero_ex.ee_len = ext4_ext_get_actual_len(ex);
+                               ext4_ext_store_pblock(&zero_ex,
+                                                     ext4_ext_pblock(ex));
+                       }
+               } else {
                        err = ext4_ext_zeroout(inode, &orig_ex);
+                       zero_ex.ee_block = orig_ex.ee_block;
+                       zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex);
+                       ext4_ext_store_pblock(&zero_ex,
+                                             ext4_ext_pblock(&orig_ex));
+               }
 
                if (err)
                        goto fix_extent_len;
@@ -3003,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle,
                ex->ee_len = cpu_to_le16(ee_len);
                ext4_ext_try_to_merge(handle, inode, path, ex);
                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+               if (err)
+                       goto fix_extent_len;
+
+               /* update extent status tree */
+               err = ext4_es_zeroout(inode, &zero_ex);
+
                goto out;
        } else if (err)
                goto fix_extent_len;
@@ -3041,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle,
        int err = 0;
        int uninitialized;
        int split_flag1, flags1;
+       int allocated = map->m_len;
 
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
@@ -3060,20 +3087,29 @@ static int ext4_split_extent(handle_t *handle,
                                map->m_lblk + map->m_len, split_flag1, flags1);
                if (err)
                        goto out;
+       } else {
+               allocated = ee_len - (map->m_lblk - ee_block);
        }
-
+       /*
+        * Updating the path is required because the previous
+        * ext4_split_extent_at() may have resulted in a split of the original
+        * leaf or in an extent zeroout.
+        */
        ext4_ext_drop_refs(path);
        path = ext4_ext_find_extent(inode, map->m_lblk, path);
        if (IS_ERR(path))
                return PTR_ERR(path);
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       uninitialized = ext4_ext_is_uninitialized(ex);
+       split_flag1 = 0;
 
        if (map->m_lblk >= ee_block) {
-               split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
-                                           EXT4_EXT_DATA_VALID2);
-               if (uninitialized)
+               split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+               if (uninitialized) {
                        split_flag1 |= EXT4_EXT_MARK_UNINIT1;
-               if (split_flag & EXT4_EXT_MARK_UNINIT2)
-                       split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+                       split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+                                                    EXT4_EXT_MARK_UNINIT2);
+               }
                err = ext4_split_extent_at(handle, inode, path,
                                map->m_lblk, split_flag1, flags);
                if (err)
@@ -3082,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle,
 
        ext4_ext_show_leaf(inode, path);
 out:
-       return err ? err : map->m_len;
+       return err ? err : allocated;
 }
 
 /*
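
Returning 'allocated' instead of map->m_len fixes the case where the requested map extends past the end of the extent being split. For example (illustrative numbers): with an extent covering logical blocks 100..119 (ee_block = 100, ee_len = 20) and a map asking for blocks 110..139 (m_lblk = 110, m_len = 30), no split is needed at the upper boundary and only 20 - (110 - 100) = 10 blocks are actually covered, so the function must report 10 rather than 30.
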
@@ -3137,6 +3173,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
+       zero_ex.ee_len = 0;
 
        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
@@ -3227,13 +3264,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
                max_zeroout = sbi->s_extent_max_zeroout_kb >>
-                       inode->i_sb->s_blocksize_bits;
+                       (inode->i_sb->s_blocksize_bits - 10);
 
        /* If extent is less than s_max_zeroout_kb, zeroout directly */
        if (max_zeroout && (ee_len <= max_zeroout)) {
                err = ext4_ext_zeroout(inode, ex);
                if (err)
                        goto out;
+               zero_ex.ee_block = ex->ee_block;
+               zero_ex.ee_len = ext4_ext_get_actual_len(ex);
+               ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
 
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
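
The shift fix converts s_extent_max_zeroout_kb from kilobytes into filesystem blocks. Worked example (illustrative numbers): with 4 KiB blocks, s_blocksize_bits is 12, so a 32 KiB limit becomes 32 >> (12 - 10) = 8 blocks; the old code shifted by the full s_blocksize_bits, giving 32 >> 12 = 0 and effectively disabling zeroout instead of capping it at 32 KiB.
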
@@ -3292,6 +3332,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                err = allocated;
 
 out:
+       /* If we have gotten a failure, don't zero out status tree */
+       if (!err)
+               err = ext4_es_zeroout(inode, &zero_ex);
        return err ? err : allocated;
 }
 
@@ -3374,8 +3417,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                "block %llu, max_blocks %u\n", inode->i_ino,
                  (unsigned long long)ee_block, ee_len);
 
-       /* If extent is larger than requested then split is required */
+       /*
+        * If the extent is larger than requested, it is a clear sign that we
+        * still have some extent state machine issues left, so a split is
+        * still required.
+        * TODO: once all related issues are fixed, this situation should be
+        * illegal.
+        */
        if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+               ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+                            " len %u; IO logical block %llu, len %u\n",
+                            inode->i_ino, (unsigned long long)ee_block, ee_len,
+                            (unsigned long long)map->m_lblk, map->m_len);
+#endif
                err = ext4_split_unwritten_extents(handle, inode, map, path,
                                                   EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
@@ -3626,6 +3680,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                                                 path, map->m_len);
                } else
                        err = ret;
+               map->m_flags |= EXT4_MAP_MAPPED;
+               if (allocated > map->m_len)
+                       allocated = map->m_len;
+               map->m_len = allocated;
                goto out2;
        }
        /* buffered IO case */
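
Setting EXT4_MAP_MAPPED and clamping the length here is what fixes the m_len reported after an unwritten extent conversion. For example (illustrative numbers): if the unwritten extent just converted covers 16 blocks but the caller only asked to map 4, 'allocated' starts out as 16 and is clamped to map->m_len = 4, so the caller never sees a mapping longer than the one it requested.
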
@@ -3675,6 +3733,7 @@ out:
                                        allocated - map->m_len);
                allocated = map->m_len;
        }
+       map->m_len = allocated;
 
        /*
         * If we have done fallocate with the offset that is already
@@ -4106,9 +4165,6 @@ got_allocated_blocks:
                        }
                } else {
                        BUG_ON(allocated_clusters < reserved_clusters);
-                       /* We will claim quota for all newly allocated blocks.*/
-                       ext4_da_update_reserve_space(inode, allocated_clusters,
-                                                       1);
                        if (reserved_clusters < allocated_clusters) {
                                struct ext4_inode_info *ei = EXT4_I(inode);
                                int reservation = allocated_clusters -
@@ -4159,6 +4215,15 @@ got_allocated_blocks:
                                ei->i_reserved_data_blocks += reservation;
                                spin_unlock(&ei->i_block_reservation_lock);
                        }
+                       /*
+                        * We will claim quota for all newly allocated blocks.
+                        * We're updating the reserved space *after* the
+                        * correction above so we do not accidentally free
+                        * all the metadata reservation because we might
+                        * actually need it later on.
+                        */
+                       ext4_da_update_reserve_space(inode, allocated_clusters,
+                                                       1);
                }
        }
 
@@ -4368,8 +4433,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (len <= EXT_UNINIT_MAX_LEN << blkbits)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-       /* Prevent race condition between unwritten */
-       ext4_flush_unwritten_io(inode);
 retry:
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk = map.m_lblk + ret;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 95796a1b7522b7e02dd72dfab15fbf2cebdb6b13..fe3337a85edeaecd6135333759173d8c6fbc0e78 100644
@@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 static int ext4_es_can_be_merged(struct extent_status *es1,
                                 struct extent_status *es2)
 {
-       if (es1->es_lblk + es1->es_len != es2->es_lblk)
+       if (ext4_es_status(es1) != ext4_es_status(es2))
                return 0;
 
-       if (ext4_es_status(es1) != ext4_es_status(es2))
+       if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
                return 0;
 
-       if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
-           (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2)))
+       if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
                return 0;
 
-       return 1;
+       if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+           (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+               return 1;
+
+       if (ext4_es_is_hole(es1))
+               return 1;
+
+       /* we need to check that a delayed extent is without unwritten status */
+       if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+               return 1;
+
+       return 0;
 }
 
 static struct extent_status *
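
The widened arithmetic guards the 32-bit es_lblk and es_len fields against wraparound. For example (illustrative numbers): merging two extents of length 0x80000000 each would need a combined length of 0x100000000, which cannot be represented in the 32-bit es_len, so the (__u64) sum rejects the merge before any truncation can occur; the same cast keeps es_lblk + es_len from wrapping when it is compared against es2->es_lblk.
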
@@ -389,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
        return es;
 }
 
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+                                           struct extent_status *es)
+{
+       struct ext4_ext_path *path = NULL;
+       struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       ext4_fsblk_t ee_start;
+       unsigned short ee_len;
+       int depth, ee_status, es_status;
+
+       path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+       if (IS_ERR(path))
+               return;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+
+       if (ex) {
+
+               ee_block = le32_to_cpu(ex->ee_block);
+               ee_start = ext4_ext_pblock(ex);
+               ee_len = ext4_ext_get_actual_len(ex);
+
+               ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+               es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+               /*
+                * Make sure ex and es do not overlap when we try to insert
+                * a delayed/hole extent.
+                */
+               if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+                       if (in_range(es->es_lblk, ee_block, ee_len)) {
+                               pr_warn("ES insert assertation failed for "
+                                       "inode: %lu we can find an extent "
+                                       "at block [%d/%d/%llu/%c], but we "
+                                       "want to add an delayed/hole extent "
+                                       "[%d/%d/%llu/%llx]\n",
+                                       inode->i_ino, ee_block, ee_len,
+                                       ee_start, ee_status ? 'u' : 'w',
+                                       es->es_lblk, es->es_len,
+                                       ext4_es_pblock(es), ext4_es_status(es));
+                       }
+                       goto out;
+               }
+
+               /*
+                * We don't check ee_block == es->es_lblk, etc., because es
+                * might be a part of the whole extent, or vice versa.
+                */
+               if (es->es_lblk < ee_block ||
+                   ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+                       pr_warn("ES insert assertation failed for inode: %lu "
+                               "ex_status [%d/%d/%llu/%c] != "
+                               "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+                               ee_block, ee_len, ee_start,
+                               ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+                               ext4_es_pblock(es), es_status ? 'u' : 'w');
+                       goto out;
+               }
+
+               if (ee_status ^ es_status) {
+                       pr_warn("ES insert assertation failed for inode: %lu "
+                               "ex_status [%d/%d/%llu/%c] != "
+                               "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+                               ee_block, ee_len, ee_start,
+                               ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+                               ext4_es_pblock(es), es_status ? 'u' : 'w');
+               }
+       } else {
+               /*
+                * We can't find an extent on disk, so we need to make sure
+                * that we are not adding a written/unwritten extent.
+                */
+               if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+                       pr_warn("ES insert assertation failed for inode: %lu "
+                               "can't find an extent at block %d but we want "
+                               "to add an written/unwritten extent "
+                               "[%d/%d/%llu/%llx]\n", inode->i_ino,
+                               es->es_lblk, es->es_lblk, es->es_len,
+                               ext4_es_pblock(es), ext4_es_status(es));
+               }
+       }
+out:
+       if (path) {
+               ext4_ext_drop_refs(path);
+               kfree(path);
+       }
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+                                           struct extent_status *es)
+{
+       struct ext4_map_blocks map;
+       int retval;
+
+       /*
+        * Here we call ext4_ind_map_blocks to look up a block mapping because
+        * the 'Indirect' structure is defined in indirect.c, so we cannot
+        * access the direct/indirect tree from outside it.  Defining this
+        * function in indirect.c would be too ugly.
+        */
+
+       map.m_lblk = es->es_lblk;
+       map.m_len = es->es_len;
+
+       retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+       if (retval > 0) {
+               if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+                       /*
+                        * We want to add a delayed/hole extent but this
+                        * block has been allocated.
+                        */
+                       pr_warn("ES insert assertation failed for inode: %lu "
+                               "We can find blocks but we want to add a "
+                               "delayed/hole extent [%d/%d/%llu/%llx]\n",
+                               inode->i_ino, es->es_lblk, es->es_len,
+                               ext4_es_pblock(es), ext4_es_status(es));
+                       return;
+               } else if (ext4_es_is_written(es)) {
+                       if (retval != es->es_len) {
+                               pr_warn("ES insert assertation failed for "
+                                       "inode: %lu retval %d != es_len %d\n",
+                                       inode->i_ino, retval, es->es_len);
+                               return;
+                       }
+                       if (map.m_pblk != ext4_es_pblock(es)) {
+                               pr_warn("ES insert assertation failed for "
+                                       "inode: %lu m_pblk %llu != "
+                                       "es_pblk %llu\n",
+                                       inode->i_ino, map.m_pblk,
+                                       ext4_es_pblock(es));
+                               return;
+                       }
+               } else {
+                       /*
+                        * We don't need to check for an unwritten extent
+                        * because an indirect-based file cannot have one.
+                        */
+                       BUG_ON(1);
+               }
+       } else if (retval == 0) {
+               if (ext4_es_is_written(es)) {
+                       pr_warn("ES insert assertation failed for inode: %lu "
+                               "We can't find the block but we want to add "
+                               "an written extent [%d/%d/%llu/%llx]\n",
+                               inode->i_ino, es->es_lblk, es->es_len,
+                               ext4_es_pblock(es), ext4_es_status(es));
+                       return;
+               }
+       }
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+                                              struct extent_status *es)
+{
+       /*
+        * We don't need to worry about the race condition because
+        * the caller holds i_data_sem.
+        */
+       BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               ext4_es_insert_extent_ext_check(inode, es);
+       else
+               ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+                                              struct extent_status *es)
+{
+}
+#endif
+
 static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
 {
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
@@ -471,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
        ext4_es_store_status(&newes, status);
        trace_ext4_es_insert_extent(inode, &newes);
 
+       ext4_es_insert_extent_check(inode, &newes);
+
        write_lock(&EXT4_I(inode)->i_es_lock);
        err = __es_remove_extent(inode, lblk, end);
        if (err != 0)
@@ -669,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
        return err;
 }
 
+int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+       ext4_lblk_t  ee_block;
+       ext4_fsblk_t ee_pblock;
+       unsigned int ee_len;
+
+       ee_block  = le32_to_cpu(ex->ee_block);
+       ee_len    = ext4_ext_get_actual_len(ex);
+       ee_pblock = ext4_ext_pblock(ex);
+
+       if (ee_len == 0)
+               return 0;
+
+       return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+                                    EXTENT_STATUS_WRITTEN);
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
        struct ext4_sb_info *sbi = container_of(shrink,
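
A note on the ext4_es_zeroout() helper added above: it simply re-inserts the zeroed-out range into the status tree as a WRITTEN extent, and the ee_len == 0 early return is what allows ext4_ext_convert_to_initialized() to initialize zero_ex.ee_len to 0 and call the helper unconditionally from its out: label even when nothing was zeroed.
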
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f190dfe969dacc59e34d15f03e2a7c5e3c5ef5e2..d8e2d4dc311e62c99843fa16c7fc8d4a0798bddb 100644
 #define es_debug(fmt, ...)     no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked against the result of a fresh extent/indirect tree lookup.
+ */
+#define ES_AGGRESSIVE_TEST__
+
 /*
  * These flags live in the high bits of extent_status.es_pblk
  */
@@ -33,6 +39,8 @@
                                 EXTENT_STATUS_DELAYED | \
                                 EXTENT_STATUS_HOLE)
 
+struct ext4_extent;
+
 struct extent_status {
        struct rb_node rb_node;
        ext4_lblk_t es_lblk;    /* first logical block extent covers */
@@ -58,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
                                        struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 struct extent_status *es);
+extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
 
 static inline int ext4_es_is_written(struct extent_status *es)
 {
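
How the self-test is switched on: the trailing underscores in ES_AGGRESSIVE_TEST__ leave the checks compiled out by default. Changing that line to "#define ES_AGGRESSIVE_TEST" and rebuilding enables the #ifdef ES_AGGRESSIVE_TEST blocks in extents_status.c and inode.c, which cross-check every extent status cache hit against the on-disk extent or indirect tree and warn on any mismatch.
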
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 32fd2b9075dd4029925d11adb0b84df1a399a8b1..6c5bb8d993fe8ebb07ae48dab2697e247c78ea01 100644
@@ -324,8 +324,8 @@ error_return:
 }
 
 struct orlov_stats {
+       __u64 free_clusters;
        __u32 free_inodes;
-       __u32 free_clusters;
        __u32 used_dirs;
 };
 
@@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 
        if (flex_size > 1) {
                stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-               stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
+               stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
                stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
                return;
        }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9ea0cde3fa9e0ffe7aebc28940293c422ae75a63..b3a5213bc73eac2082cbfa16bad66bba89c32ed4 100644
@@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode)
 
        trace_ext4_evict_inode(inode);
 
-       ext4_ioend_wait(inode);
-
        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
@@ -207,7 +205,8 @@ void ext4_evict_inode(struct inode *inode)
                 * don't use page cache.
                 */
                if (ext4_should_journal_data(inode) &&
-                   (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+                   (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+                   inode->i_ino != EXT4_JOURNAL_INO) {
                        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
                        tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 
@@ -216,6 +215,7 @@ void ext4_evict_inode(struct inode *inode)
                        filemap_write_and_wait(&inode->i_data);
                }
                truncate_inode_pages(&inode->i_data, 0);
+               ext4_ioend_shutdown(inode);
                goto no_delete;
        }
 
@@ -225,6 +225,7 @@ void ext4_evict_inode(struct inode *inode)
        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);
+       ext4_ioend_shutdown(inode);
 
        if (is_bad_inode(inode))
                goto no_delete;
@@ -482,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
        return num;
 }
 
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+                                      struct inode *inode,
+                                      struct ext4_map_blocks *es_map,
+                                      struct ext4_map_blocks *map,
+                                      int flags)
+{
+       int retval;
+
+       map->m_flags = 0;
+       /*
+        * There is a race window in which the result may differ, e.g.
+        * xfstests #223 when dioread_nolock is enabled.  The reason is
+        * that we look up a block mapping in the extent status tree
+        * without taking i_data_sem, so the unwritten extent could be
+        * converted in the meantime.
+        */
+       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+               down_read((&EXT4_I(inode)->i_data_sem));
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               retval = ext4_ext_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+       } else {
+               retval = ext4_ind_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+       }
+       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+               up_read((&EXT4_I(inode)->i_data_sem));
+       /*
+        * Clear the EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flags
+        * because they shouldn't be marked in es_map->m_flags.
+        */
+       map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
+
+       /*
+        * We don't check m_len because extents may be collapsed in the
+        * status tree, so the m_len values might not be equal.
+        */
+       if (es_map->m_lblk != map->m_lblk ||
+           es_map->m_flags != map->m_flags ||
+           es_map->m_pblk != map->m_pblk) {
+               printk("ES cache assertation failed for inode: %lu "
+                      "es_cached ex [%d/%d/%llu/%x] != "
+                      "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+                      inode->i_ino, es_map->m_lblk, es_map->m_len,
+                      es_map->m_pblk, es_map->m_flags, map->m_lblk,
+                      map->m_len, map->m_pblk, map->m_flags,
+                      retval, flags);
+       }
+}
+#endif /* ES_AGGRESSIVE_TEST */
+
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -509,6 +562,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct extent_status es;
        int retval;
+#ifdef ES_AGGRESSIVE_TEST
+       struct ext4_map_blocks orig_map;
+
+       memcpy(&orig_map, map, sizeof(*map));
+#endif
 
        map->m_flags = 0;
        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
@@ -531,6 +589,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                } else {
                        BUG_ON(1);
                }
+#ifdef ES_AGGRESSIVE_TEST
+               ext4_map_blocks_es_recheck(handle, inode, map,
+                                          &orig_map, flags);
+#endif
                goto found;
        }
 
@@ -551,6 +613,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                int ret;
                unsigned long long status;
 
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertation failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (lookup)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
+               }
+#endif
+
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
@@ -643,6 +714,24 @@ found:
                int ret;
                unsigned long long status;
 
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertation failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (allocation)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
+               }
+#endif
+
+               /*
+                * If the extent has been zeroed out, we don't need to update
+                * the extent status tree.
+                */
+               if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+                   ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+                       if (ext4_es_is_written(&es))
+                               goto has_zeroout;
+               }
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
@@ -655,6 +744,7 @@ found:
                        retval = ret;
        }
 
+has_zeroout:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                int ret = check_block_validity(inode, map);
@@ -1215,6 +1305,55 @@ static int ext4_journalled_write_end(struct file *file,
        return ret ? ret : copied;
 }
 
+/*
+ * Reserve metadata for a single block located at lblock
+ */
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
+{
+       int retries = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned int md_needed;
+       ext4_lblk_t save_last_lblock;
+       int save_len;
+
+       /*
+        * recalculate the number of metadata blocks to reserve
+        * in order to allocate nrblocks
+        * worst case is one extent per block
+        */
+repeat:
+       spin_lock(&ei->i_block_reservation_lock);
+       /*
+        * ext4_calc_metadata_amount() has side effects, which we have
+        * to be prepared to undo if we fail to claim space.
+        */
+       save_len = ei->i_da_metadata_calc_len;
+       save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+       md_needed = EXT4_NUM_B2C(sbi,
+                                ext4_calc_metadata_amount(inode, lblock));
+       trace_ext4_da_reserve_space(inode, md_needed);
+
+       /*
+        * We do still charge estimated metadata to the sb though;
+        * we cannot afford to run out of free blocks.
+        */
+       if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+               ei->i_da_metadata_calc_len = save_len;
+               ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+               spin_unlock(&ei->i_block_reservation_lock);
+               if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+                       cond_resched();
+                       goto repeat;
+               }
+               return -ENOSPC;
+       }
+       ei->i_reserved_meta_blocks += md_needed;
+       spin_unlock(&ei->i_block_reservation_lock);
+
+       return 0;       /* success */
+}
+
 /*
  * Reserve a single cluster located at lblock
  */
@@ -1263,7 +1402,7 @@ repeat:
                ei->i_da_metadata_calc_last_lblock = save_last_lblock;
                spin_unlock(&ei->i_block_reservation_lock);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-                       yield();
+                       cond_resched();
                        goto repeat;
                }
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
@@ -1768,6 +1907,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
        struct extent_status es;
        int retval;
        sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+       struct ext4_map_blocks orig_map;
+
+       memcpy(&orig_map, map, sizeof(*map));
+#endif
 
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;
@@ -1809,6 +1953,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                else
                        BUG_ON(1);
 
+#ifdef ES_AGGRESSIVE_TEST
+               ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
                return retval;
        }
 
@@ -1843,8 +1990,11 @@ add_delayed:
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                 */
-               /* If the block was allocated from previously allocated cluster,
-                * then we dont need to reserve it again. */
+               /*
+                * If the block was allocated from a previously allocated
+                * cluster, then we don't need to reserve it again.  However,
+                * we still need to reserve metadata for every block we're
+                * going to write.
+                */
                if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
                        ret = ext4_da_reserve_space(inode, iblock);
                        if (ret) {
@@ -1852,6 +2002,13 @@ add_delayed:
                                retval = ret;
                                goto out_unlock;
                        }
+               } else {
+                       ret = ext4_da_reserve_metadata(inode, iblock);
+                       if (ret) {
+                               /* not enough space to reserve */
+                               retval = ret;
+                               goto out_unlock;
+                       }
                }
 
                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -1873,6 +2030,15 @@ add_delayed:
                int ret;
                unsigned long long status;
 
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertation failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (lookup)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
+               }
+#endif
+
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -2908,8 +3074,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 
        trace_ext4_releasepage(page);
 
-       WARN_ON(PageChecked(page));
-       if (!page_has_buffers(page))
+       /* Page has dirty journalled data -> cannot release */
+       if (PageChecked(page))
                return 0;
        if (journal)
                return jbd2_journal_try_to_free_buffers(journal, page, wait);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7bb713a46fe4c3f85ae9d35bd15bea73aa7957f5..ee6614bdb63950b7481443f944ca3c87681903be 100644
@@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
-               atomic_sub(ac->ac_b_ex.fe_len,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_sub(ac->ac_b_ex.fe_len,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3692,11 +3692,7 @@ repeat:
        if (free < needed && busy) {
                busy = 0;
                ext4_unlock_group(sb, group);
-               /*
-                * Yield the CPU here so that we don't get soft lockup
-                * in non preempt case.
-                */
-               yield();
+               cond_resched();
                goto repeat;
        }
 
@@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
 
                        /* let others to free the space */
-                       yield();
+                       cond_resched();
                        ar->len = ar->len >> 1;
                }
                if (!ar->len) {
@@ -4464,7 +4460,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        struct buffer_head *bitmap_bh = NULL;
        struct super_block *sb = inode->i_sb;
        struct ext4_group_desc *gdp;
-       unsigned long freed = 0;
        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
@@ -4666,14 +4661,12 @@ do_more:
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(count_clusters,
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(count_clusters,
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
 
-       freed += count;
-
        if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
                dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
 
@@ -4811,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(EXT4_NUM_B2C(sbi, blocks_freed),
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+                            &sbi->s_flex_groups[flex_group].free_clusters);
        }
 
        ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4e81d47aa8cb8af0553b93c1cb7dcbbea2d5d0a8..33e1c086858b54ad704e80345c92b946ae3c565b 100644
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-               struct ext4_ext_path **path)
+               struct ext4_ext_path **orig_path)
 {
        int ret = 0;
+       struct ext4_ext_path *path;
 
-       *path = ext4_ext_find_extent(inode, lblock, *path);
-       if (IS_ERR(*path)) {
-               ret = PTR_ERR(*path);
-               *path = NULL;
-       } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+       path = ext4_ext_find_extent(inode, lblock, *orig_path);
+       if (IS_ERR(path))
+               ret = PTR_ERR(path);
+       else if (path[ext_depth(inode)].p_ext == NULL)
                ret = -ENODATA;
+       else
+               *orig_path = path;
 
        return ret;
 }
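
Under the reworked get_ext_path() contract, the found path is stored in *orig_path only on success and ownership passes to the caller, which must drop the refs and kfree() the path array. A condensed caller pattern, mirroring mext_check_coverage() below (sketch only, not a complete function):

    struct ext4_ext_path *path = NULL;
    int err = get_ext_path(inode, lblock, &path);

    if (!err) {
            /* ... inspect path[ext_depth(inode)].p_ext ... */
            ext4_ext_drop_refs(path);
            kfree(path);
    }

The old callers dropped the refs but never kfree()d the path array itself, which is the mext_check_coverage() leak this series fixes.
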
@@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ext;
+       int ret = 0;
        ext4_lblk_t last = from + count;
        while (from < last) {
                *err = get_ext_path(inode, from, &path);
                if (*err)
-                       return 0;
+                       goto out;
                ext = path[ext_depth(inode)].p_ext;
-               if (!ext) {
-                       ext4_ext_drop_refs(path);
-                       return 0;
-               }
-               if (uninit != ext4_ext_is_uninitialized(ext)) {
-                       ext4_ext_drop_refs(path);
-                       return 0;
-               }
+               if (uninit != ext4_ext_is_uninitialized(ext))
+                       goto out;
                from += ext4_ext_get_actual_len(ext);
                ext4_ext_drop_refs(path);
        }
-       return 1;
+       ret = 1;
+out:
+       if (path) {
+               ext4_ext_drop_refs(path);
+               kfree(path);
+       }
+       return ret;
 }
 
 /**
@@ -666,6 +669,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        int replaced_count = 0;
        int dext_alen;
 
+       *err = ext4_es_remove_extent(orig_inode, from, count);
+       if (*err)
+               goto out;
+
+       *err = ext4_es_remove_extent(donor_inode, from, count);
+       if (*err)
+               goto out;
+
        /* Get the original extent for the block "orig_off" */
        *err = get_ext_path(orig_inode, orig_off, &orig_path);
        if (*err)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 809b31003ecc0fedd35d71e8abed4047bbe18beb..047a6de04a0ac8195d5453de514a8e95526fcce4 100644
@@ -50,11 +50,21 @@ void ext4_exit_pageio(void)
        kmem_cache_destroy(io_page_cachep);
 }
 
-void ext4_ioend_wait(struct inode *inode)
+/*
+ * This function is called by ext4_evict_inode() to make sure there is
+ * no more pending I/O completion work left to do.
+ */
+void ext4_ioend_shutdown(struct inode *inode)
 {
        wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+       /*
+        * We need to make sure the work structure is finished being
+        * used before we let the inode get destroyed.
+        */
+       if (work_pending(&EXT4_I(inode)->i_unwritten_work))
+               cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
 }
 
 static void put_io_page(struct ext4_io_page *io_page)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b2c8ee56eb98744ab7481e4d0849e2fb5ca71365..c169477a62c987a1ba2d3362dde6bd8d3cd0f260 100644
@@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb,
            sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group;
                flex_group = ext4_flex_group(sbi, group_data[0].group);
-               atomic_add(EXT4_NUM_B2C(sbi, free_blocks),
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+                            &sbi->s_flex_groups[flex_group].free_clusters);
                atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
                           &sbi->s_flex_groups[flex_group].free_inodes);
        }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b3818b48f418577ad5a7abeb979e1b7d1acd836b..5d6d53578124dda01132a6545100a5acb2025f73 100644
@@ -1927,8 +1927,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
                flex_group = ext4_flex_group(sbi, i);
                atomic_add(ext4_free_inodes_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].free_inodes);
-               atomic_add(ext4_free_group_clusters(sb, gdp),
-                          &sbi->s_flex_groups[flex_group].free_clusters);
+               atomic64_add(ext4_free_group_clusters(sb, gdp),
+                            &sbi->s_flex_groups[flex_group].free_clusters);
                atomic_add(ext4_used_dirs_count(sb, gdp),
                           &sbi->s_flex_groups[flex_group].used_dirs);
        }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6ee5aed56b178ef98850b3a9abc75f6afc54f7f..325bc019ed8813ea00321594405e86c739dad5fb 100644
@@ -1065,9 +1065,12 @@ out:
 void jbd2_journal_set_triggers(struct buffer_head *bh,
                               struct jbd2_buffer_trigger_type *type)
 {
-       struct journal_head *jh = bh2jh(bh);
+       struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
 
+       if (WARN_ON(!jh))
+               return;
        jh->b_triggers = type;
+       jbd2_journal_put_journal_head(jh);
 }
 
 void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
@@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-       struct journal_head *jh = bh2jh(bh);
+       struct journal_head *jh;
        int ret = 0;
 
-       jbd_debug(5, "journal_head %p\n", jh);
-       JBUFFER_TRACE(jh, "entry");
        if (is_handle_aborted(handle))
                goto out;
-       if (!buffer_jbd(bh)) {
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh) {
                ret = -EUCLEAN;
                goto out;
        }
+       jbd_debug(5, "journal_head %p\n", jh);
+       JBUFFER_TRACE(jh, "entry");
 
        jbd_lock_bh_state(bh);
 
@@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
        jbd_unlock_bh_state(bh);
+       jbd2_journal_put_journal_head(jh);
 out:
        JBUFFER_TRACE(jh, "exit");
        WARN_ON(ret);   /* All errors are bugs, so dump the stack */