ext4: Turn off multiple page-io submission by default
author: Theodore Ts'o <tytso@mit.edu>
Tue, 14 Dec 2010 20:27:50 +0000 (15:27 -0500)
committer: Theodore Ts'o <tytso@mit.edu>
Tue, 14 Dec 2010 20:27:50 +0000 (15:27 -0500)
Jon Nelson has found a test case which causes postgresql to fail with
the error:

psql:t.sql:4: ERROR: invalid page header in block 38269 of relation base/16384/16581

Under memory pressure, it looks like part of a file can end up getting
replaced by zeros.  Until we can figure out the cause, we'll roll
back the change and use block_write_full_page() instead of
ext4_bio_write_page().  The new, more efficient writing function can
be used via the mount option mblk_io_submit, so we can test and fix
the new page I/O code.

To reproduce the problem, install postgres 8.4 or 9.0, and pin enough
memory that the system is just on the verge of triggering writeback
before running the following SQL script:

begin;
create temporary table foo as select x as a, ARRAY[x] as b FROM
generate_series(1, 10000000 ) AS x;
create index foo_a_idx on foo (a);
create index foo_b_idx on foo USING GIN (b);
rollback;

If the temporary table is created on a hard drive partition which is
encrypted using dm_crypt, then under memory pressure, approximately
30-40% of the time, pgsql will issue the above failure.

This patch should fix this problem, and the problem will come back if
the file system is mounted with the mblk_io_submit mount option.

Reported-by: Jon Nelson <jnelson@jamponi.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c

index 6a5edea2d70b3ac7c56e8b272686b5a799eabbcf..94ce3d7a1c4b9fe83b42954d309c8ac93b284ff5 100644 (file)
@@ -910,6 +910,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_JOURNAL_CHECKSUM    0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT      0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC            0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT      0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY      0x20000000 /* Block validity checking */
index bdbe69902207c151df025d98690c4c016cf0b0e9..e659597b690b508e1325182de702031a73843f4d 100644 (file)
@@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                         */
                        if (unlikely(journal_data && PageChecked(page)))
                                err = __ext4_journalled_writepage(page, len);
-                       else
+                       else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                                err = ext4_bio_write_page(&io_submit, page,
                                                          len, mpd->wbc);
+                       else
+                               err = block_write_full_page(page,
+                                       noalloc_get_block_write, mpd->wbc);
 
                        if (!err)
                                mpd->pages_written++;
index e32195d6aac34656b13a5619c3750e8b30586c08..fb15c9c0be74d5c06ba421f04449d981a69337ed 100644 (file)
@@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
            !(def_mount_opts & EXT4_DEFM_NODELALLOC))
                seq_puts(seq, ",nodelalloc");
 
+       if (test_opt(sb, MBLK_IO_SUBMIT))
+               seq_puts(seq, ",mblk_io_submit");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
        /*
@@ -1239,8 +1241,8 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
        Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
-       Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-       Opt_block_validity, Opt_noblock_validity,
+       Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+       Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard,
@@ -1304,6 +1306,8 @@ static const match_table_t tokens = {
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
+       {Opt_mblk_io_submit, "mblk_io_submit"},
+       {Opt_nomblk_io_submit, "nomblk_io_submit"},
        {Opt_block_validity, "block_validity"},
        {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1725,6 +1729,12 @@ set_qf_format:
                case Opt_nodelalloc:
                        clear_opt(sbi->s_mount_opt, DELALLOC);
                        break;
+               case Opt_mblk_io_submit:
+                       set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                       break;
+               case Opt_nomblk_io_submit:
+                       clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT);
+                       break;
                case Opt_stripe:
                        if (match_int(&args[0], &option))
                                return 0;